Tags

,

This series talks about how to use Nutch and Solr to implement Google Search’s “Jump to” and anchor-link features. This article describes how to use a Solr DocTransformer to add the anchor tag and content to the search response.
The Problem
In the search results, to help users jump directly to the section they may be interested in, we want to add anchor links below the page description, just like Google Search’s “Jump to” and anchor-link features.
Main Steps
1. Extract anchor tag, text and content in Nutch
Please refer to
Using Nutch to Extract Anchor Tag and Content
Using HTML Parser Jsoup and Regex to Extract Text between Two Tags
Debugging and Optimizing Regular Expression
2. Using UpdateRequestProcessor to Store Anchor Tag and Content into Solr
3. Using Solr DocTransformer to Add Anchor Tag and Content into Response
This is described in the current article.

Task: Using Solr DocTransformer to Add Anchor Tag and Content into Response
In the previous articles, we used Nutch to extract the anchor tag, text and content from each web page, and saved them into Solr as separate docs with docType 1.

To return the tag information for each web page that matches the query, we can use a Solr DocTransformer to add fields to the response.

AnchorTransformerFactory
DocTransformer is very powerful and useful: it allows us to add, remove or update fields before the document is returned. But it has one limitation: by default it can only add one field, and the field name must be [transformer_name].


AnchorTransformer adds two fields, anchorTag and anchorText, to the SolrDocument. If we just use fl=[anchors], the response will not contain these fields. We have to use fl=[anchors],anchorTag,anchorText. Listing anchorTag and anchorText tells Solr to add them to SolrReturnFields. Please refer to the code at SolrReturnFields.add(String, NamedList<String>, DocTransformers, SolrQueryRequest).

/**
 * Factory for the anchor document transformer.
 *
 * <p>For every document in a search response, the transformer runs a
 * sub-query against the anchor documents ({@code docType:1}) that belong to
 * that page and match the user's query, and adds the resulting
 * {@code anchorTag} / {@code anchorText} values to the document.
 *
 * <p>Configuration (from the {@code <transformer>} element) and per-request
 * local parameters both understand:
 * <ul>
 *   <li>{@code sort} - {@code "order"} sorts anchors by {@code anchorOrder};
 *       anything else keeps the default score ordering.</li>
 *   <li>{@code anchorRows} - maximum number of anchors fetched per document
 *       (defaults to 5).</li>
 * </ul>
 */
public class AnchorTransformerFactory extends TransformerFactory {

  /** Value of {@code sort} that orders anchors by their {@code anchorOrder} field. */
  private static final String SORT_BY_ORDER = "order";
  /** Request handler used to execute the per-document anchor sub-query. */
  private static final String ANCHOR_HANDLER = "/select";

  protected static final Logger logger = LoggerFactory
      .getLogger(AnchorTransformerFactory.class);

  private String defaultSort;
  private int defaultAnchorRows = 5;

  /**
   * Reads the factory-level defaults {@code sort} and {@code anchorRows}.
   *
   * @param args the arguments of the {@code <transformer>} element
   * @throws NumberFormatException if {@code anchorRows} is not an integer
   */
  @Override
  @SuppressWarnings("rawtypes")
  public void init(NamedList args) {
    super.init(args);
    Object obj = args.get("sort");
    if (obj != null) {
      // toString() instead of a cast so a non-<str> config element does not
      // fail with a ClassCastException.
      defaultSort = obj.toString();
    }
    obj = args.get("anchorRows");
    if (obj != null) {
      defaultAnchorRows = Integer.parseInt(obj.toString());
    }
  }

  /**
   * Creates a transformer for one request; the local parameters
   * {@code sort} and {@code anchorRows} override the configured defaults
   * when they are present and not blank.
   *
   * @param field  the name under which the transformer was requested
   * @param params local parameters of the transformer
   * @param req    the request being served
   * @return a new transformer bound to {@code req}
   * @throws NumberFormatException if {@code anchorRows} is not an integer
   */
  @Override
  public DocTransformer create(String field, SolrParams params,
      SolrQueryRequest req) {
    String sort = defaultSort;
    if (StringUtils.isNotBlank(params.get("sort"))) {
      sort = params.get("sort");
    }
    int anchorRows = defaultAnchorRows;
    if (StringUtils.isNotBlank(params.get("anchorRows"))) {
      anchorRows = Integer.parseInt(params.get("anchorRows"));
    }
    return new AnchorTransformer(field, req, sort, anchorRows);
  }

  /**
   * Adds the matching anchors of a page to its response document.
   * Note that one sub-query is executed per transformed document.
   */
  private static class AnchorTransformer extends DocTransformer {
    private final String name;
    private final SolrQueryRequest req;
    private final String sort;
    private final int anchorRows;

    public AnchorTransformer(String field, SolrQueryRequest req, String sort,
        int anchorRows) {
      this.name = field;
      this.req = req;
      this.sort = sort;
      this.anchorRows = anchorRows;
    }

    /** Returns the name this transformer was requested under. */
    @Override
    public String getName() {
      return name;
    }

    /**
     * Looks up the anchors of {@code doc} that match the original query and
     * appends them to the document as {@code anchorTag} and
     * {@code anchorText} values.
     *
     * <p>The document is left untouched when the request has no query or
     * the document has no usable {@code contentid}.
     */
    @Override
    public void transform(SolrDocument doc, int docid) throws IOException {
      String oldQuery = req.getParams().get(CommonParams.Q);
      String id = fieldValueAsString(doc.getFieldValue("contentid"));
      if (StringUtils.isBlank(oldQuery) || StringUtils.isBlank(id)) {
        // Nothing to search with; escaping a null value would throw.
        return;
      }

      SolrQuery query = new SolrQuery();
      query
          .setQuery(
              "anchorContent:" + ClientUtils.escapeQueryChars(oldQuery)
                  + " AND url: " + ClientUtils.escapeQueryChars(id))
          .addFilterQuery("docType:1").setRows(anchorRows)
          .setFields("anchorTag", "anchorText");
      if (SORT_BY_ORDER.equals(sort)) {
        query.setSort("anchorOrder", ORDER.asc);
      }
      // else default, sort by score

      List<Map<String,String>> anchors = extractSingleFieldValues(
          req.getCore(), ANCHOR_HANDLER, query, "anchorTag", "anchorText");
      for (Map<String,String> anchor : anchors) {
        // Both fields are always added, even when one is missing, so the
        // i-th anchorTag keeps lining up with the i-th anchorText.
        doc.addField("anchorTag", anchor.get("anchorTag"));
        doc.addField("anchorText", anchor.get("anchorText"));
      }
    }

    /**
     * Converts a field value taken from a response document into a string.
     * Lucene field objects (including lazily loaded ones) are unwrapped via
     * {@code stringValue()}; any other value falls back to
     * {@code toString()}.
     *
     * @return the string form, or {@code null} if {@code value} is null
     */
    private static String fieldValueAsString(Object value) {
      if (value == null) {
        return null;
      }
      if (value instanceof org.apache.lucene.document.Field) {
        return ((Field) value).stringValue();
      }
      if (value instanceof IndexableField) {
        return ((IndexableField) value).stringValue();
      }
      return value.toString();
    }

    /**
     * Runs {@code query} through the named request handler of {@code core}
     * and returns, for every hit, a map from field name to value for the
     * requested fields.
     *
     * @param core        the core to query
     * @param handlerName name of the request handler, e.g. {@code /select}
     * @param query       the query to run; its field list is replaced by
     *                    {@code fls}
     * @param fls         the fields to extract from each hit
     * @return one map per hit, in result order; fields without a value are
     *         absent from the map
     * @throws IllegalStateException if no handler is registered under
     *                               {@code handlerName}
     */
    public static List<Map<String,String>> extractSingleFieldValues(
        SolrCore core, String handlerName, SolrQuery query, String... fls)
        throws IOException {
      SolrRequestHandler requestHandler = core.getRequestHandler(handlerName);
      if (requestHandler == null) {
        throw new IllegalStateException(
            "No request handler registered for: " + handlerName);
      }
      query.setFields(fls);
      SolrQueryRequest newReq = new LocalSolrQueryRequest(core, query);
      try {
        SolrQueryResponse queryRsp = new SolrQueryResponse();
        requestHandler.handleRequest(newReq, queryRsp);
        return extractSingleFieldValues(newReq, queryRsp, fls);
      } finally {
        // Releases the searcher reference held by the local request.
        newReq.close();
      }
    }

    /**
     * Extracts the requested fields from the {@code response} entry of
     * {@code newRsp}, which may be either a {@code ResultContext} (internal
     * doc ids that still have to be loaded) or a {@code SolrDocumentList}.
     * The response itself is not modified.
     *
     * @param newReq the request that produced {@code newRsp}; supplies the
     *               searcher used to load stored fields
     * @param newRsp the response to read
     * @param fls    the fields to extract from each hit
     * @return one map per hit, in result order; fields without a value are
     *         absent from the map
     */
    @SuppressWarnings("rawtypes")
    public static List<Map<String,String>> extractSingleFieldValues(
        SolrQueryRequest newReq, SolrQueryResponse newRsp, String[] fls)
        throws IOException {
      List<Map<String,String>> rst = new ArrayList<Map<String,String>>();
      NamedList values = newRsp.getValues();
      Object rspObj = values.get("response");

      if (rspObj instanceof ResultContext) {
        // Ask the searcher for exactly the fields we read, built once
        // instead of allocating an empty set for every hit.
        HashSet<String> wanted = new HashSet<String>();
        for (String fl : fls) {
          wanted.add(fl);
        }
        SolrIndexSearcher searcher = newReq.getSearcher();
        DocIterator dit = ((ResultContext) rspObj).docs.iterator();
        while (dit.hasNext()) {
          Document stored = searcher.doc(dit.nextDoc(), wanted);
          Map<String,String> row = new HashMap<String,String>();
          for (String fl : fls) {
            String value = stored.get(fl);
            if (value != null) {
              row.put(fl, value);
            }
          }
          rst.add(row);
        }
      } else if (rspObj instanceof SolrDocumentList) {
        for (SolrDocument solrDoc : (SolrDocumentList) rspObj) {
          Map<String,String> row = new HashMap<String,String>();
          for (String fl : fls) {
            Object value = solrDoc.getFieldValue(fl);
            if (value != null) {
              row.put(fl, value.toString());
            }
          }
          rst.add(row);
        }
      }
      return rst;
    }
  }
}

SolrConfig.xml

  <!-- Registers AnchorTransformerFactory under the name "anchors", so it is requested with fl=[anchors]. -->
  <!-- anchorRows is read in AnchorTransformerFactory.init() as the default number of anchors fetched per document. -->
  <transformer name="anchors" class="AnchorTransformerFactory" >
    <int name="anchorRows">5</int>
  </transformer>
  <!-- anchorTag and anchorText are listed in fl next to [anchors] so that the fields added by the transformer are kept in the response. -->
  <requestHandler name="/select" class="solr.SearchHandler"
		default="true">  
      <lst name="defaults">
          <str name="fl">otherfields,[anchors],anchorTag,anchorText</str>
       </lst>
   </requestHandler>

Resource
Using UpdateRequestProcessor to Store Anchor Tag and Content into Solr
Using Nutch to Extract Anchor Tag and Content
Using HTML Parser Jsoup and Regex to Extract Text between Two Tags
Debugging and Optimizing Regular Expression

via Blogger

Advertisements