Tags

,

The Problem
We use Nutch to crawl web sites and save the content into Solr for search.

A website usually applies a template which defines the header, footer, and navigation menus. Take one fictional storage-related documentation site as an example. The word storage appears multiple times in the header, footer and menus.
The website also has some very simple contact-us or login pages. Because these pages contain very little content, it’s very likely that if a user searches for storage, these two pages would be ranked highly and listed on the first page of results.

We want to save main content in one field in Solr, and boost that field.
The Solution
Change Nutch to Send Raw Content to Solr
By default, Nutch sends Solr the HTML-tag-stripped content, not the raw HTML page content.
To send raw content to Solr, we have to create one extra nutch plugin:

//<![CDATA[
if(showAdsense){
document.write("(adsbygoogle = window.adsbygoogle || []).push({});”)
}
//]]>

public class ExtraIndexingFilter implements IndexingFilter {
  /** Name of the index field that receives the raw (un-stripped) page content. */
  public static final String FL_RAWCONTENT = "rawcontent";
  /** Config key; must match the property name set in nutch-site.xml. */
  private static final String CONF_RAWCONTENT_KEY = "extra-index.rawcontent";
  private Configuration conf;
  // Cached flag, read from the configuration in setConf().
  private boolean indexRawContent;
  private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

  static {
    // Tell Nutch this filter needs the page content loaded from storage.
    FIELDS.add(WebPage.Field.CONTENT);
  }

  /**
   * Copies the raw page content into the "rawcontent" document field when
   * extra-index.rawcontent is enabled.
   *
   * @param doc  the document being built for indexing
   * @param url  the page URL (unused here)
   * @param page the crawled page whose content buffer is read verbatim
   * @return the (possibly augmented) document
   */
  public NutchDocument filter(NutchDocument doc, String url, WebPage page)
      throws IndexingException {
    if (indexRawContent) {
      ByteBuffer bb = page.getContent();
      if (bb != null) {
        // Bug fix: respect the buffer's arrayOffset/position/limit instead of
        // dumping the entire backing array, and decode with an explicit charset
        // rather than the platform default. UTF-8 is assumed here; TODO: honor
        // the page's declared charset if available.
        Charset utf8 = Charset.forName("UTF-8");
        if (bb.hasArray()) {
          doc.add(FL_RAWCONTENT, new String(bb.array(),
              bb.arrayOffset() + bb.position(), bb.remaining(), utf8));
        } else {
          // Direct buffer: copy out the readable bytes without disturbing
          // the buffer's position.
          byte[] bytes = new byte[bb.remaining()];
          bb.duplicate().get(bytes);
          doc.add(FL_RAWCONTENT, new String(bytes, utf8));
        }
      }
    }
    return doc;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
    // Bug fix: this previously read "index-extra.rawcontent", which never
    // matches the "extra-index.rawcontent" property set in nutch-site.xml,
    // so raw content was silently never indexed.
    indexRawContent = conf.getBoolean(CONF_RAWCONTENT_KEY, false);
  }
}

Then change nutch-site.xml, add the plugin(index-extra) in plugin.includes. 
Set extra-index.rawcontent to true, and set http.content.limit to -1, so Nutch will crawl whole page.

<property>
  <name>extra-index.rawcontent</name>
  <value>true</value>
</property>
<property>
	<name>http.content.limit</name>
	<value>-1</value>
</property>

In solrindex-mapping.xml, add:

<field dest="rawcontent" source="rawcontent" />

Using boilerpipe to Remove Boilerplate Content in Solr
Next, we will define one Solr update processor which will use boilerpipe to remove the surplus “clutter” (boilerplate, templates).
BoilerpipeProcessor will use boilerpipe to remove boilerplate from originfield and save the stripped main content into strippedField.

import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.extractors.ArticleExtractor;

public class BoilerpipeProcessorFactory extends UpdateRequestProcessorFactory {
  private static final Logger logger = LoggerFactory
      .getLogger(BoilerpipeProcessorFactory.class);
  private boolean enabled = true;
  private String originfield, strippedField;
  private boolean removeOriginfield = true;

  public void init(NamedList args) {
    super.init(args);
    if (args != null) {
      SolrParams params = SolrParams.toSolrParams(args);
      enabled = params.getBool("enabled", true);
      if (!enabled) return;
      removeOriginfield = params.getBool("removeOriginfield", true);
      originfield = Preconditions.checkNotNull(params.get("originfield"),
          "Must set originfield.");
      
      strippedField = Preconditions.checkNotNull(params.get("strippedField"),
          "Must set strippedField.");
    }
  }
  public UpdateRequestProcessor getInstance(SolrQueryRequest req,
      SolrQueryResponse rsp, UpdateRequestProcessor next) {
    if (!enabled) return null;
    return new BoilerpipeProcessor(next);
  }
  
  private class BoilerpipeProcessor extends UpdateRequestProcessor {
    public BoilerpipeProcessor(UpdateRequestProcessor next) {
      super(next);
    }
    public void processAdd(AddUpdateCommand cmd) throws IOException {
      SolrInputDocument doc = cmd.solrDoc;
      Collection<Object> colls = doc.getFieldValues(originfield);
      if (colls != null) {
        for (Object obj : colls) {
          if (obj != null) {
            String str = obj.toString();
            try {
              String strippedText = ArticleExtractor.getInstance().getText(str);
              doc.addField(strippedField, strippedText);
              if (removeOriginfield) {
                doc.removeField(originfield);
              }
            } catch (BoilerpipeProcessingException e) {
              logger.error("Error happened when use boilerpipe to strip text.",
                  e);
            }
          }
        }
      }
      super.processAdd(cmd);
    }
  }
}

Add the processor into the default chain in the solrconfig.xml:

<updateRequestProcessorChain name="defaultChain" default="true">
  <processor
			class="org.lifelongprogrammer.BoilerpipeProcessorFactory">
    <bool name="enabled">true</bool>
    <str name="originfield">rawcontent</str>
    <str name="strippedField">main_content</str>
    <bool name="removeOriginfield">true</bool>
  </processor>
  <processor class="solr.LogUpdateProcessorFactory" />
  <processor class="solr.RunUpdateProcessorFactory" />	
</updateRequestProcessorChain>

Add main_content field in schema.xml:

<field name="main_content" type="text_rev" indexed="true" stored="true"  omitNorms="false" />

After all this, we can change our search handler to boost on main_content field:

<requestHandler name="/select" class="solr.SearchHandler" default="true">
  <lst name="defaults">
    <!-- Omitted -->
    <str name="qf">main_content^10 body_stored</str>
  </lst>
</requestHandler>

Resources
boilerpipe library

via Blogger

Advertisements