View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.maven.index;
20  
21  import javax.inject.Named;
22  import javax.inject.Singleton;
23  
24  import java.io.IOException;
25  import java.io.StringReader;
26  
27  import org.apache.lucene.analysis.TokenStream;
28  import org.apache.lucene.index.Term;
29  import org.apache.lucene.queryparser.classic.ParseException;
30  import org.apache.lucene.queryparser.classic.QueryParser;
31  import org.apache.lucene.queryparser.classic.QueryParser.Operator;
32  import org.apache.lucene.search.BooleanClause.Occur;
33  import org.apache.lucene.search.BooleanQuery;
34  import org.apache.lucene.search.BoostQuery;
35  import org.apache.lucene.search.PrefixQuery;
36  import org.apache.lucene.search.Query;
37  import org.apache.lucene.search.TermQuery;
38  import org.apache.lucene.search.WildcardQuery;
39  import org.apache.maven.index.context.NexusAnalyzer;
40  import org.apache.maven.index.creator.JarFileContentsIndexCreator;
41  import org.apache.maven.index.creator.MinimalArtifactInfoIndexCreator;
42  import org.apache.maven.index.expr.SearchExpression;
43  import org.apache.maven.index.expr.SearchTyped;
44  import org.slf4j.Logger;
45  import org.slf4j.LoggerFactory;
46  
47  /**
48   * A default {@link QueryCreator} constructs Lucene query for provided query text.
49   * <p>
50   * By default wildcards are created such as query text matches beginning of the field value or beginning of the
51   * class/package name segment for {@link ArtifactInfo#NAMES NAMES} field. But it can be controlled by using special
52   * markers:
53   * <ul>
54   * <li>* - any character</li>
55   * <li>'^' - beginning of the text</li>
56   * <li>'$' or '&lt;' or ' ' end of the text</li>
57   * </ul>
58   * For example:
59   * <ul>
60   * <li>junit - matches junit and junit-foo, but not foo-junit</li>
61   * <li>*junit - matches junit, junit-foo and foo-junit</li>
62   * <li>^junit$ - matches junit, but not junit-foo, nor foo-junit</li>
63   * </ul>
64   *
65   * @author Eugene Kuleshov
66   */
67  @Singleton
68  @Named
69  public class DefaultQueryCreator implements QueryCreator {
70  
71      private final Logger logger = LoggerFactory.getLogger(getClass());
72  
73      protected Logger getLogger() {
74          return logger;
75      }
76  
77      // ==
78  
79      public IndexerField selectIndexerField(final Field field, final SearchType type) {
80          IndexerField lastField = null;
81  
82          for (IndexerField indexerField : field.getIndexerFields()) {
83              lastField = indexerField;
84  
85              if (type.matchesIndexerField(indexerField)) {
86                  return indexerField;
87              }
88          }
89  
90          return lastField;
91      }
92  
93      public Query constructQuery(final Field field, final SearchExpression expression) throws ParseException {
94          SearchType searchType = SearchType.SCORED;
95  
96          if (expression instanceof SearchTyped) {
97              searchType = ((SearchTyped) expression).getSearchType();
98          }
99  
100         return constructQuery(field, expression.getStringValue(), searchType);
101     }
102 
103     public Query constructQuery(final Field field, final String query, final SearchType type) throws ParseException {
104         if (type == null) {
105             throw new NullPointerException("Cannot construct query with type of \"null\"!");
106         }
107 
108         if (field == null) {
109             throw new NullPointerException("Cannot construct query for field \"null\"!");
110         } else {
111             return constructQuery(field, selectIndexerField(field, type), query, type);
112         }
113     }
114 
115     @Deprecated
116     public Query constructQuery(String field, String query) {
117         Query result;
118 
119         if (MinimalArtifactInfoIndexCreator.FLD_GROUP_ID_KW.getKey().equals(field)
120                 || MinimalArtifactInfoIndexCreator.FLD_ARTIFACT_ID_KW.getKey().equals(field)
121                 || MinimalArtifactInfoIndexCreator.FLD_VERSION_KW.getKey().equals(field)
122                 || JarFileContentsIndexCreator.FLD_CLASSNAMES_KW.getKey().equals(field)) {
123             // these are special untokenized fields, kept for use cases like TreeView is (exact matching).
124             result = legacyConstructQuery(field, query);
125         } else {
126             QueryParser qp = new QueryParser(field, new NexusAnalyzer());
127 
128             // small cheap trick
129             // if a query is not "expert" (does not contain field:val kind of expression)
130             // but it contains star and/or punctuation chars, example: "common-log*"
131             if (!query.contains(":")) {
132                 if (query.contains("*") && query.matches(".*(\\.|-|_).*")) {
133                     query = query.toLowerCase()
134                             .replaceAll("\\*", "X")
135                             .replaceAll("\\.|-|_", " ")
136                             .replaceAll("X", "*");
137                 }
138             }
139 
140             try {
141                 result = qp.parse(query);
142             } catch (ParseException e) {
143                 getLogger()
144                         .debug("Query parsing with \"legacy\" method, we got ParseException from QueryParser: "
145                                 + e.getMessage());
146 
147                 result = legacyConstructQuery(field, query);
148             }
149         }
150 
151         if (getLogger().isDebugEnabled()) {
152             getLogger().debug("Query parsed as: " + result.toString());
153         }
154 
155         return result;
156     }
157 
158     // ==
159 
160     public Query constructQuery(
161             final Field field, final IndexerField indexerField, final String query, final SearchType type)
162             throws ParseException {
163         if (indexerField == null) {
164             getLogger()
165                     .warn("Querying for field \"" + field.toString() + "\" without any indexer field was tried. "
166                             + "Please review your code, and consider adding this field to index!");
167 
168             return null;
169         }
170         if (!indexerField.isIndexed()) {
171             getLogger()
172                     .warn("Querying for non-indexed field " + field.toString()
173                             + " was tried. Please review your code or consider adding this field to index!");
174 
175             return null;
176         }
177 
178         if (Field.NOT_PRESENT.equals(query)) {
179             return new WildcardQuery(new Term(indexerField.getKey(), "*"));
180         }
181 
182         if (SearchType.EXACT.equals(type)) {
183             if (indexerField.isKeyword()) {
184                 // no tokenization should happen against the field!
185                 if (query.contains("*") || query.contains("?")) {
186                     return new WildcardQuery(new Term(indexerField.getKey(), query));
187                 } else {
188                     // exactly what callee wants
189                     return new TermQuery(new Term(indexerField.getKey(), query));
190                 }
191             } else if (!indexerField.isKeyword() && indexerField.isStored()) {
192                 // TODO: resolve this better! Decouple QueryCreator and IndexCreators!
193                 // This is a hack/workaround here
194                 if (JarFileContentsIndexCreator.FLD_CLASSNAMES_KW.equals(indexerField)) {
195                     if (query.startsWith("/")) {
196                         return new TermQuery(new Term(
197                                 indexerField.getKey(), query.toLowerCase().replaceAll("\\.", "/")));
198                     } else {
199                         return new TermQuery(new Term(
200                                 indexerField.getKey(), "/" + query.toLowerCase().replaceAll("\\.", "/")));
201                     }
202                 } else {
203                     getLogger()
204                             .warn(type.toString()
205                                     + " type of querying for non-keyword (but stored) field "
206                                     + indexerField.getOntology().toString()
207                                     + " was tried. Please review your code, or indexCreator involved, "
208                                     + "since this type of querying of this field is currently unsupported.");
209 
210                     // will never succeed (unless we supply him "filter" too, but that would kill performance)
211                     // and is possible with stored fields only
212                     return null;
213                 }
214             } else {
215                 getLogger()
216                         .warn(type.toString()
217                                 + " type of querying for non-keyword (and not stored) field "
218                                 + indexerField.getOntology().toString()
219                                 + " was tried. Please review your code, or indexCreator involved, "
220                                 + "since this type of querying of this field is impossible.");
221 
222                 // not a keyword indexerField, nor stored. No hope at all. Impossible even with "filtering"
223                 return null;
224             }
225         } else if (SearchType.SCORED.equals(type)) {
226             if (JarFileContentsIndexCreator.FLD_CLASSNAMES.equals(indexerField)) {
227                 String qpQuery = query.toLowerCase().replaceAll("\\.", " ").replaceAll("/", " ");
228                 // tokenization should happen against the field!
229                 QueryParser qp = new QueryParser(indexerField.getKey(), new NexusAnalyzer());
230                 qp.setDefaultOperator(Operator.AND);
231                 return qp.parse(qpQuery);
232             } else if (indexerField.isKeyword()) {
233                 // no tokenization should happen against the field!
234                 if (query.contains("*") || query.contains("?")) {
235                     return new WildcardQuery(new Term(indexerField.getKey(), query));
236                 } else {
237                     Term t = new Term(indexerField.getKey(), query);
238                     return new BooleanQuery.Builder()
239                             .add(new TermQuery(t), Occur.SHOULD)
240                             .add(new BoostQuery(new PrefixQuery(t), 0.8f), Occur.SHOULD)
241                             .build();
242                 }
243             } else {
244                 // to save "original" query
245                 String qpQuery = query;
246 
247                 // tokenization should happen against the field!
248                 QueryParser qp = new QueryParser(indexerField.getKey(), new NexusAnalyzer());
249                 qp.setDefaultOperator(Operator.AND);
250 
251                 // small cheap trick
252                 // if a query is not "expert" (does not contain field:val kind of expression)
253                 // but it contains star and/or punctuation chars, example: "common-log*"
254                 // since Lucene does not support multi-terms WITH wildcards.
255                 // So, here, we "mimic" NexusAnalyzer (this should be fixed!)
256                 // but do this with PRESERVING original query!
257                 if (qpQuery.matches(".*(\\.|-|_|/).*")) {
258                     qpQuery = qpQuery.toLowerCase()
259                             .replaceAll("\\*", "X")
260                             .replaceAll("\\.|-|_|/", " ")
261                             .replaceAll("X", "*")
262                             .replaceAll(" \\* ", "")
263                             .replaceAll("^\\* ", "")
264                             .replaceAll(" \\*$", "");
265                 }
266 
267                 // "fix" it with trailing "*" if not there, but only if it not ends with a space
268                 if (!qpQuery.endsWith("*") && !qpQuery.endsWith(" ")) {
269                     qpQuery += "*";
270                 }
271 
272                 try {
273                     // qpQuery = "\"" + qpQuery + "\"";
274 
275                     BooleanQuery.Builder q1b = new BooleanQuery.Builder().add(qp.parse(qpQuery), Occur.SHOULD);
276 
277                     if (qpQuery.contains(" ")) {
278                         q1b.add(qp.parse("\"" + qpQuery + "\""), Occur.SHOULD);
279                     }
280 
281                     Query q2 = null;
282 
283                     int termCount = countTerms(indexerField, query);
284 
285                     // try with KW only if the processed query in qpQuery does not have spaces!
286                     if (!query.contains(" ") && termCount > 1) {
287                         // get the KW field
288                         IndexerField keywordField = selectIndexerField(indexerField.getOntology(), SearchType.EXACT);
289 
290                         if (keywordField.isKeyword()) {
291                             q2 = constructQuery(indexerField.getOntology(), keywordField, query, type);
292                         }
293                     }
294 
295                     if (q2 == null) {
296                         return q1b.build();
297                     } else {
298                         return new BooleanQuery.Builder()
299                                 // trick with order
300                                 .add(q2, Occur.SHOULD)
301                                 .add(q1b.build(), Occur.SHOULD)
302                                 .build();
303                     }
304                 } catch (ParseException e) {
305                     // TODO: we are not falling back anymore to legacy!
306                     throw e;
307 
308                     // getLogger().debug(
309                     // "Query parsing with \"legacy\" method, we got ParseException from QueryParser: "
310                     // + e.getMessage() );
311                     //
312                     // return legacyConstructQuery( indexerField.getKey(), query );
313                 }
314             }
315         } else {
316             // what search type is this?
317             return null;
318         }
319     }
320 
321     public Query legacyConstructQuery(String field, String query) {
322         if (query == null || query.length() == 0) {
323             getLogger().info("Empty or null query for field:" + field);
324 
325             return null;
326         }
327 
328         String q = query.toLowerCase();
329 
330         char h = query.charAt(0);
331 
332         if (JarFileContentsIndexCreator.FLD_CLASSNAMES_KW.getKey().equals(field)
333                 || JarFileContentsIndexCreator.FLD_CLASSNAMES.getKey().equals(field)) {
334             q = q.replaceAll("\\.", "/");
335 
336             if (h == '^') {
337                 q = q.substring(1);
338 
339                 if (q.charAt(0) != '/') {
340                     q = '/' + q;
341                 }
342             } else if (h != '*') {
343                 q = "*/" + q;
344             }
345         } else {
346             if (h == '^') {
347                 q = q.substring(1);
348             } else if (h != '*') {
349                 q = "*" + q;
350             }
351         }
352 
353         int l = q.length() - 1;
354         char c = q.charAt(l);
355         if (c == ' ' || c == '<' || c == '$') {
356             q = q.substring(0, q.length() - 1);
357         } else if (c != '*') {
358             q += "*";
359         }
360 
361         int n = q.indexOf('*');
362         if (n == -1) {
363             return new TermQuery(new Term(field, q));
364         } else if (n > 0 && n == q.length() - 1) {
365             return new PrefixQuery(new Term(field, q.substring(0, q.length() - 1)));
366         }
367 
368         return new WildcardQuery(new Term(field, q));
369     }
370 
371     // ==
372 
373     private NexusAnalyzer nexusAnalyzer = new NexusAnalyzer();
374 
375     protected int countTerms(final IndexerField indexerField, final String query) {
376         try {
377             TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query));
378             ts.reset();
379 
380             int result = 0;
381 
382             while (ts.incrementToken()) {
383                 result++;
384             }
385 
386             ts.end();
387             ts.close();
388 
389             return result;
390         } catch (IOException e) {
391             // will not happen
392             return 1;
393         }
394     }
395 }