Tags

,

The Problem
Nutch and Nutch2 support NTLM, Basic, and Digest authentication for authenticating to websites, but they do not support HTTP POST form authentication.

Main Steps
Use Apache Http Client to do http post form authentication.
Make http post form authentication work.
Integrate http form authentication in Nutch2.

With the previous two steps complete, we can now integrate HTTP form authentication into Nutch2.
Define Http Form Post Authentication Properties in httpclient-auth.xml
Nutch uses the http.auth.file property to locate the XML file that defines credential information; its default value is httpclient-auth.xml. We extend httpclient-auth.xml to include the HTTP form authentication properties. The httpclient-auth.xml for the ASP.NET web application from the last post looks like this:

//<![CDATA[
if(showAdsense){
document.write("(adsbygoogle = window.adsbygoogle || []).push({});”)
}
//]]>

<?xml version="1.0"?>
<auth-configuration>
  <credentials authMethod="formAuth" loginUrl="http://localhost:44444/Account/Login.aspx" loginFormId="ctl01" loginRedirect="true">
    <loginPostData>
      <field name="ctl00$MainContent$LoginUser$UserName" value="admin"/>
      <field name="ctl00$MainContent$LoginUser$Password" value="admin123"/>
    </loginPostData>
    <removedFormFields>
      <field name="ctl00$MainContent$LoginUser$RememberMe"/>
    </removedFormFields>
  </credentials>
</auth-configuration>
Read Http Form Post Authentication from Configuration XML File
In Nutch’s http-client plugin, change the org.apache.nutch.protocol.httpclient.Http.setCredentials() method to read the authentication info from the configuration file into the variable formConfigurer.
Then change the Http.resolveCredentials() method: if formConfigurer is not null, use HttpFormAuthentication to perform the form POST login.
package org.apache.nutch.protocol.httpclient;
/**
 * Excerpt of the http-client protocol plugin showing the changes needed for
 * HTTP form-post authentication.
 *
 * <p>The members referenced here but not declared in this excerpt
 * ({@code formConfigurer}, {@code client}, {@code conf}, {@code authFile},
 * {@code authRulesRead}, {@code LOG}) are expected to be declared elsewhere
 * in the full class.
 */
public class Http extends HttpBase {

  /**
   * Performs the form-post login when a form authentication configuration
   * has been loaded into {@code formConfigurer}; otherwise does nothing in
   * this excerpt.
   *
   * @param url the URL being fetched (not used by the form-auth branch)
   * @throws RuntimeException wrapping any exception thrown by the login
   */
  private void resolveCredentials(URL url) {
    if (formConfigurer != null) {
      HttpFormAuthentication formAuther = new HttpFormAuthentication(
          formConfigurer, client, this);
      try {
        formAuther.login();
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
      return;
    }
  }

  /**
   * Reads the authentication configuration file once and stores any form
   * authentication settings in {@code formConfigurer}.
   *
   * <p>{@code authRulesRead} is set before parsing, so a failed read is not
   * retried on later calls.
   *
   * @throws ParserConfigurationException if the XML parser cannot be created
   * @throws SAXException if the configuration file is not well-formed XML
   * @throws IOException if the configuration file cannot be read or closed
   */
  private static synchronized void setCredentials()
      throws ParserConfigurationException, SAXException, IOException {

    if (authRulesRead) {
      return;
    }

    authRulesRead = true; // Avoid re-attempting to read
    InputStream is = conf.getConfResourceAsInputStream(authFile);
    if (is == null) {
      return;
    }

    try {
      Document doc = DocumentBuilderFactory.newInstance()
          .newDocumentBuilder().parse(is);

      Element rootElement = doc.getDocumentElement();
      if (!"auth-configuration".equals(rootElement.getTagName())) {
        // Only warn: the children are still processed below.
        if (LOG.isWarnEnabled()) {
          LOG.warn("Bad auth conf file: root element <"
              + rootElement.getTagName() + "> found in "
              + authFile + " - must be <auth-configuration>");
        }
      }

      // For each set of credentials
      NodeList credList = rootElement.getChildNodes();
      for (int i = 0; i < credList.getLength(); i++) {
        Node credNode = credList.item(i);
        if (!(credNode instanceof Element)) {
          continue; // skip text/whitespace/comment nodes
        }

        Element credElement = (Element) credNode;
        if (!"credentials".equals(credElement.getTagName())) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Bad auth conf file: Element <"
                + credElement.getTagName()
                + "> not recognized in " + authFile
                + " - expected <credentials>");
          }
          continue;
        }

        // read http form post auth info
        String authMethod = credElement.getAttribute("authMethod");
        if (StringUtils.isNotBlank(authMethod)) {
          formConfigurer = readFormAuthConfigurer(credElement, authMethod);
          // Form-auth entries are fully handled here; skip any further
          // per-element processing.
          continue;
        }
      }
    } finally {
      // Release the configuration stream even when parsing fails.
      is.close();
    }
  }

  /**
   * Builds a form authentication configuration from a
   * {@code <credentials>} element.
   *
   * @param credElement the {@code <credentials>} element to read
   * @param authMethod the value of its {@code authMethod} attribute; only
   *        {@code "formAuth"} is supported
   * @return the populated configuration
   * @throws IllegalArgumentException if {@code authMethod} is not
   *         {@code "formAuth"}, or if {@code loginUrl} or
   *         {@code loginFormId} is missing or blank
   */
  private static HttpFormAuthConfigurer readFormAuthConfigurer(
      Element credElement, String authMethod) {
    if (!"formAuth".equals(authMethod)) {
      throw new IllegalArgumentException("Unsupported authMethod: "
          + authMethod);
    }

    // Named differently from the static field to avoid shadowing it.
    HttpFormAuthConfigurer configurer = new HttpFormAuthConfigurer();

    String str = credElement.getAttribute("loginUrl");
    if (StringUtils.isNotBlank(str)) {
      configurer.setLoginUrl(str.trim());
    } else {
      throw new IllegalArgumentException("Must set loginUrl.");
    }
    str = credElement.getAttribute("loginFormId");
    if (StringUtils.isNotBlank(str)) {
      configurer.setLoginFormId(str.trim());
    } else {
      throw new IllegalArgumentException("Must set loginFormId.");
    }
    str = credElement.getAttribute("loginRedirect");
    if (StringUtils.isNotBlank(str)) {
      // Trim like the other attributes: Boolean.parseBoolean(" true ")
      // would otherwise evaluate to false.
      configurer.setLoginRedirect(Boolean.parseBoolean(str.trim()));
    }

    NodeList nodeList = credElement.getChildNodes();
    for (int j = 0; j < nodeList.getLength(); j++) {
      Node node = nodeList.item(j);
      if (!(node instanceof Element)) {
        continue;
      }

      Element element = (Element) node;
      String tagName = element.getTagName();
      if ("loginPostData".equals(tagName)) {
        configurer.setLoginPostData(readNameValueFields(element));
      } else if ("additionalPostHeaders".equals(tagName)) {
        configurer.setAdditionalPostHeaders(readNameValueFields(element));
      } else if ("removedFormFields".equals(tagName)) {
        configurer.setRemovedFormFields(readFieldNames(element));
      }
    }
    return configurer;
  }

  /**
   * Collects the {@code name}/{@code value} attribute pairs of every child
   * element of {@code parent} into a map.
   */
  private static Map<String, String> readNameValueFields(Element parent) {
    Map<String, String> fields = new HashMap<String, String>();
    NodeList childNodes = parent.getChildNodes();
    for (int k = 0; k < childNodes.getLength(); k++) {
      Node fieldNode = childNodes.item(k);
      if (!(fieldNode instanceof Element)) {
        continue;
      }
      Element fieldElement = (Element) fieldNode;
      fields.put(fieldElement.getAttribute("name"),
          fieldElement.getAttribute("value"));
    }
    return fields;
  }

  /**
   * Collects the {@code name} attribute of every child element of
   * {@code parent} into a set.
   */
  private static Set<String> readFieldNames(Element parent) {
    Set<String> names = new HashSet<String>();
    NodeList childNodes = parent.getChildNodes();
    for (int k = 0; k < childNodes.getLength(); k++) {
      Node fieldNode = childNodes.item(k);
      if (!(fieldNode instanceof Element)) {
        continue;
      }
      names.add(((Element) fieldNode).getAttribute("name"));
    }
    return names;
  }
}
Resources

via Blogger http://ift.tt/MvVs1d

Advertisements