Tags

,

The Problem
Nutch and Nutch2 support NTLM, Basic and Digest authentication for authenticating themselves to websites. They do not support HTTP POST form authentication.

Main Steps
Use Apache Http Client to do http post form authentication.
Test http post form authentication.
Integrate with Nutch2.
Use Apache Http Client to Do Http Post Form Authentication
HttpFormAuthConfigurer
First, let’s look at the HttpFormAuthConfigurer class. loginUrl and loginFormId need no explanation. loginPostData stores the names and values of the login fields, such as username:user1, password:password1. removedFormFields lists the input fields we want to remove from the form before posting. additionalPostHeaders is used when we have to add additional header names and values to the login POST request. If loginRedirect is true and the login POST returns a redirect code (301 or 302), Http Client will automatically follow the redirect.

package org.apache.nutch.protocol.httpclient;
/**
 * Settings for an HTTP POST form login.
 *
 * <p>
 * The collection-valued properties are never {@code null}: they start out as
 * empty mutable collections and the setters replace a {@code null} argument
 * with an empty collection, so callers can iterate or query them without a
 * null check.
 */
public class HttpFormAuthConfigurer {
	/** URL of the page that contains the login form and receives the POST. */
	private String loginUrl;
	/** Value of the {@code id} attribute of the login form element. */
	private String loginFormId;
	/** Login field names mapped to the values to submit (e.g. username, password). */
	private Map<String, String> loginPostData = new java.util.HashMap<String, String>();
	/** Names of form input fields that must not be submitted. */
	private Set<String> removedFormFields = new java.util.HashSet<String>();
	/** Extra header names and values to send with the login request. */
	private Map<String, String> additionalPostHeaders = new java.util.HashMap<String, String>();
	/** Whether a redirect answer to the login POST should be followed. */
	private boolean loginRedirect;

	/** @return the login page URL, or {@code null} if not configured */
	public String getLoginUrl() {
		return loginUrl;
	}

	/** @param loginUrl the login page URL */
	public void setLoginUrl(String loginUrl) {
		this.loginUrl = loginUrl;
	}

	/** @return the id of the login form, or {@code null} if not configured */
	public String getLoginFormId() {
		return loginFormId;
	}

	/** @param loginFormId the id of the login form element */
	public void setLoginFormId(String loginFormId) {
		this.loginFormId = loginFormId;
	}

	/** @return the login field names and values; never {@code null} */
	public Map<String, String> getLoginPostData() {
		return loginPostData;
	}

	/** @param loginPostData login field names and values; {@code null} means none */
	public void setLoginPostData(Map<String, String> loginPostData) {
		this.loginPostData = loginPostData == null
				? new java.util.HashMap<String, String>()
				: loginPostData;
	}

	/** @return the names of the form fields to drop; never {@code null} */
	public Set<String> getRemovedFormFields() {
		return removedFormFields;
	}

	/** @param removedFormFields names of form fields to drop; {@code null} means none */
	public void setRemovedFormFields(Set<String> removedFormFields) {
		this.removedFormFields = removedFormFields == null
				? new java.util.HashSet<String>()
				: removedFormFields;
	}

	/** @return the extra login request headers; never {@code null} */
	public Map<String, String> getAdditionalPostHeaders() {
		return additionalPostHeaders;
	}

	/** @param additionalPostHeaders extra request headers; {@code null} means none */
	public void setAdditionalPostHeaders(
			Map<String, String> additionalPostHeaders) {
		this.additionalPostHeaders = additionalPostHeaders == null
				? new java.util.HashMap<String, String>()
				: additionalPostHeaders;
	}

	/** @return {@code true} if a redirect after the login POST should be followed */
	public boolean isLoginRedirect() {
		return loginRedirect;
	}

	/** @param loginRedirect whether to follow a redirect after the login POST */
	public void setLoginRedirect(boolean loginRedirect) {
		this.loginRedirect = loginRedirect;
	}
}

HttpFormAuthentication 
In the login method, it first calls CookieHandler.setDefault(new CookieManager()); so that, if the login succeeds, subsequent requests do not require logging in again.

Then it sends an HTTP GET request to the loginUrl and uses Jsoup.parse(pageContent) to parse the response. It iterates over all input fields in the login form and adds their names and values to List params, sets the values of the username and password fields, which are stored in loginPostData, and removes any form fields listed in removedFormFields. Finally, it sends a POST request to the loginUrl with the data in List params.

The following code uses Apache Http Client 3.x, as Nutch2 still uses the pretty old http client library.

package org.apache.nutch.protocol.httpclient;

/**
 * Logs in to a web site through an HTML login form using Apache Commons
 * HttpClient 3.x.
 *
 * <p>
 * {@link #login()} fetches the login page, collects the input fields of the
 * configured form, overrides or removes fields as configured, and posts the
 * result back to the login URL.
 */
public class HttpFormAuthentication {
	private static final Logger LOGGER = LoggerFactory
			.getLogger(HttpFormAuthentication.class);

	/**
	 * Baseline login request headers. This map is shared by all instances and
	 * is only written during class initialization; per-instance overrides go
	 * into {@link #loginHeaders}.
	 */
	private static final Map<String, String> defaultLoginHeaders = new HashMap<String, String>();
	static {
		defaultLoginHeaders.put("User-Agent", "Mozilla/5.0");
		defaultLoginHeaders
				.put("Accept",
						"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
		defaultLoginHeaders.put("Accept-Language", "en-US,en;q=0.5");
		defaultLoginHeaders.put("Connection", "keep-alive");
		defaultLoginHeaders.put("Content-Type",
				"application/x-www-form-urlencoded");
	}

	private final HttpClient client;
	private final HttpFormAuthConfigurer authConfigurer;
	/** This instance's copy of the default headers, plus any overrides. */
	private final Map<String, String> loginHeaders = new HashMap<String, String>(
			defaultLoginHeaders);
	/** Value of the Set-Cookie header returned by the login page, if any. */
	private String cookies;

	/**
	 * Creates an authenticator that uses a fresh {@code HttpClient} and the
	 * default login headers.
	 *
	 * @param authConfigurer the login form settings
	 */
	public HttpFormAuthentication(HttpFormAuthConfigurer authConfigurer) {
		this.authConfigurer = authConfigurer;
		this.client = new HttpClient();
	}

	/**
	 * Creates an authenticator that uses the given client and takes the
	 * Accept, Accept-Language and User-Agent headers from {@code http}.
	 *
	 * <p>
	 * The overrides are kept per instance; the shared defaults are not
	 * modified, so other instances are unaffected. A {@code null} value
	 * returned by {@code http} leaves the corresponding default in place.
	 *
	 * @param authConfigurer the login form settings
	 * @param client the HTTP client used for the login requests
	 * @param http source of the Accept, Accept-Language and User-Agent values
	 */
	public HttpFormAuthentication(HttpFormAuthConfigurer authConfigurer,
			HttpClient client, Http http) {
		this.authConfigurer = authConfigurer;
		this.client = client;
		overrideHeader("Accept", http.getAccept());
		overrideHeader("Accept-Language", http.getAcceptLanguage());
		overrideHeader("User-Agent", http.getUserAgent());
	}

	/** @return the cookie value captured from the login page, or {@code null} */
	public String getCookies() {
		return cookies;
	}

	/** @param cookies the cookie value to send with the login POST */
	public void setCookies(String cookies) {
		this.cookies = cookies;
	}

	/**
	 * Fetches the login page, builds the form parameters and posts them to
	 * the login URL.
	 *
	 * @throws IllegalArgumentException if the configured login form is not
	 *         present in the login page
	 * @throws Exception if one of the HTTP requests fails
	 */
	public void login() throws Exception {
		// Install a JVM-wide cookie handler only when none exists, so that an
		// already installed handler (and the cookies it holds) is not
		// discarded on every login.
		if (CookieHandler.getDefault() == null) {
			CookieHandler.setDefault(new CookieManager());
		}
		String pageContent = httpGetPageContent(authConfigurer.getLoginUrl());
		List<NameValuePair> params = getLoginFormParams(pageContent);
		sendPost(authConfigurer.getLoginUrl(), params);
	}

	/** Records a per-instance header override, ignoring null values. */
	private void overrideHeader(String name, String value) {
		if (value != null) {
			loginHeaders.put(name, value);
		}
	}

	/** Posts the login parameters to {@code url} and logs the outcome. */
	private void sendPost(String url, List<NameValuePair> params)
			throws IOException {
		PostMethod post = newPostMethod(url);
		try {
			setLoginHeader(post);
			post.addParameters(params.toArray(new NameValuePair[0]));

			int rspCode = client.executeMethod(post);
			if (LOGGER.isDebugEnabled()) {
				LOGGER.debug("Sending 'POST' request to URL : " + url);
				// Only the names are logged: the values include the password.
				LOGGER.debug("Post parameter names : "
						+ parameterNames(params));
				LOGGER.debug("Response Code : " + rspCode);
				for (Header header : post.getResponseHeaders()) {
					LOGGER.debug("Response headers : " + header);
				}
			}
			String rst = readBody(post.getResponseBodyAsStream(),
					post.getResponseCharSet());
			LOGGER.debug("login post result: " + rst);
		} finally {
			post.releaseConnection();
		}
	}

	/**
	 * Creates the POST method, following redirects when configured to.
	 *
	 * <p>
	 * {@code post.setFollowRedirects(true)} cannot be used because it throws
	 * IllegalArgumentException ("Entity enclosing requests cannot be
	 * redirected without user intervention"), hence the overriding subclass.
	 */
	private PostMethod newPostMethod(String url) {
		if (!authConfigurer.isLoginRedirect()) {
			return new PostMethod(url);
		}
		return new PostMethod(url) {
			@Override
			public boolean getFollowRedirects() {
				return true;
			}
		};
	}

	/** Adds the login headers and, when available, the cookie to the POST. */
	private void setLoginHeader(PostMethod post) {
		Map<String, String> headers = new HashMap<String, String>(loginHeaders);
		// additionalPostHeaders can overwrite values of the default headers
		Map<String, String> extra = authConfigurer.getAdditionalPostHeaders();
		if (extra != null) {
			headers.putAll(extra);
		}
		for (Entry<String, String> entry : headers.entrySet()) {
			post.addRequestHeader(entry.getKey(), entry.getValue());
		}
		String cookie = getCookies();
		if (cookie != null && cookie.length() > 0) {
			post.addRequestHeader("Cookie", cookie);
		}
	}

	/**
	 * Fetches {@code url} with a GET request, remembers the Set-Cookie header
	 * of the response and returns the response body (empty if there is none).
	 */
	private String httpGetPageContent(String url) throws IOException {
		GetMethod get = new GetMethod(url);
		try {
			Map<String, String> extra = authConfigurer
					.getAdditionalPostHeaders();
			if (extra != null) {
				for (Entry<String, String> entry : extra.entrySet()) {
					get.addRequestHeader(entry.getKey(), entry.getValue());
				}
			}
			client.executeMethod(get);

			Header cookieHeader = get.getResponseHeader("Set-Cookie");
			if (cookieHeader != null) {
				setCookies(cookieHeader.getValue());
			}
			return readBody(get.getResponseBodyAsStream(),
					get.getResponseCharSet());
		} finally {
			get.releaseConnection();
		}
	}

	/**
	 * Builds the POST parameters from the login form found in
	 * {@code pageContent}.
	 *
	 * @throws IllegalArgumentException if no element with the configured
	 *         form id exists in the page
	 */
	private List<NameValuePair> getLoginFormParams(String pageContent) {
		Map<String, String> postData = authConfigurer.getLoginPostData();
		if (postData == null) {
			postData = new HashMap<String, String>();
		}
		java.util.Set<String> removed = authConfigurer.getRemovedFormFields();

		List<NameValuePair> params = new ArrayList<NameValuePair>();
		Document doc = Jsoup.parse(pageContent);
		Element loginform = doc.getElementById(authConfigurer.getLoginFormId());
		if (loginform == null) {
			throw new IllegalArgumentException("No form exists: "
					+ authConfigurer.getLoginFormId());
		}
		Elements inputElements = loginform.getElementsByTag("input");

		for (Element inputElement : inputElements) {
			String key = inputElement.attr("name");
			// Inputs without a name are never submitted by a browser.
			if (key.length() == 0) {
				continue;
			}
			// skip fields in removedFormFields or loginPostData
			if (postData.containsKey(key)
					|| (removed != null && removed.contains(key))) {
				continue;
			}
			params.add(new NameValuePair(key, inputElement.attr("value")));
		}
		// add key and value in loginPostData
		for (Entry<String, String> entry : postData.entrySet()) {
			params.add(new NameValuePair(entry.getKey(), entry.getValue()));
		}
		return params;
	}

	/** Returns the parameter names only, for logging without credentials. */
	private static List<String> parameterNames(List<NameValuePair> params) {
		List<String> names = new ArrayList<String>(params.size());
		for (NameValuePair param : params) {
			names.add(param.getName());
		}
		return names;
	}

	/**
	 * Reads a response body using the given charset; a missing body yields
	 * an empty string.
	 */
	private static String readBody(java.io.InputStream body, String charset)
			throws IOException {
		return body == null ? "" : IOUtils.toString(body, charset);
	}
}

Http Form Authentication in Apache Http Client 4.x

public class HttpCilentFormLoginExample {
  private static final Logger LOGGER = LoggerFactory
      .getLogger(HttpCilentFormLoginExample.class);
  private DefaultHttpClient client = new DefaultHttpClient();
  private String loginUrl, loginForm;  
  private static Map<String,String> defaultLoginHeaders = new HashMap<String,String>();  
  static {
    defaultLoginHeaders.put("User-Agent", "Mozilla/5.0");
    defaultLoginHeaders.put("Accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    defaultLoginHeaders.put("Accept-Language", "en-US,en;q=0.5");
    defaultLoginHeaders.put("Connection", "keep-alive");
    // defaultLoginHeaders.put("Referer",
    // "http://ift.tt/18U6d7n");
    defaultLoginHeaders
        .put("Content-Type", "application/x-www-form-urlencoded");
  }
  private Map<String,String> loginPostData;
  private Map<String,String> additionalPostHeaders;
  private Set<String> removedFormFields;
  private String cookies;
  
  public HttpCilentFormLoginExample(String loginUrl, String loginForm,
      Map<String,String> loginPostData,
      Map<String,String> additionalPostHeaders, Set<String> removedFormFields) {
    this.loginUrl = loginUrl;
    this.loginForm = loginForm;
    this.loginPostData = loginPostData == null ? new HashMap<String,String>()
        : loginPostData;
    this.additionalPostHeaders = additionalPostHeaders == null ? new HashMap<String,String>()
        : additionalPostHeaders;
    this.removedFormFields = removedFormFields == null ? new HashSet<String>()
        : removedFormFields;
  }
    
  public void login() throws Exception, UnsupportedEncodingException {
    client.setRedirectStrategy(new LaxRedirectStrategy());
    // make sure cookies is turn on
    CookieHandler.setDefault(new CookieManager());
    String pageContent = httpGetPageContent(loginUrl);
    List<NameValuePair> postParams = getLoginFormParams(pageContent);
    sendPost(loginUrl, postParams);
  }
  
  private void sendPost(String url, List<NameValuePair> postParams)
      throws Exception {
    HttpPost post = new HttpPost(url);
    try {
      setLoginHeader(post);
      post.setEntity(new UrlEncodedFormEntity(postParams));      
      HttpResponse response = client.execute(post);      
      int responseCode = response.getStatusLine().getStatusCode();
      if (LOGGER.isDebugEnabled()) {
        LOGGER.info("rspCode: " + responseCode);
        LOGGER.info("\nSending 'POST' request to URL : " + url);
        LOGGER.info("Post parameters : " + postParams);
        for (Header header : response.getAllHeaders()) {
          LOGGER.info("Response headers : " + header);
        }
      }
      String rst = IOUtils.toString(response.getEntity().getContent());
      LOGGER.debug("login post result: " + rst);
    } finally {
      post.releaseConnection();
    }
  }
  
  private void setLoginHeader(HttpPost post) {
    Map<String,String> headers = new HashMap<String,String>();
    headers.putAll(defaultLoginHeaders);
    // additionalPostHeaders can overwrite value in defaultLoginHeaders
    headers.putAll(additionalPostHeaders);
    for (Entry<String,String> entry : headers.entrySet()) {
      post.setHeader(entry.getKey(), entry.getValue());
    }
    post.setHeader("Cookie", getCookies());
  }
  
  private String httpGetPageContent(String url) throws IOException {    
    HttpGet get = new HttpGet(url);
    try {
      for (Entry<String,String> entry : additionalPostHeaders.entrySet()) {
        get.setHeader(entry.getKey(), entry.getValue());
      }
      HttpResponse response = client.execute(get);
      setCookies(response.getFirstHeader("Set-Cookie") == null ? "" : response
          .getFirstHeader("Set-Cookie").toString());
      return IOUtils.toString(response.getEntity().getContent());
    } finally {
      get.releaseConnection();
    }    
  }
  
  private List<NameValuePair> getLoginFormParams(String pageContent)
      throws UnsupportedEncodingException {
    Document doc = Jsoup.parse(pageContent);
    List<NameValuePair> paramList = new ArrayList<NameValuePair>();
    Element loginform = doc.getElementById(loginForm);
    if (loginform == null) {
      throw new IllegalArgumentException("No form exists: " + loginForm);
    }
    Elements inputElements = loginform.getElementsByTag("input");
    // skip fields in removedFormFields or loginPostData
    for (Element inputElement : inputElements) {
      String key = inputElement.attr("name");
      String value = inputElement.attr("value");
      if (loginPostData.containsKey(key) || removedFormFields.contains(key)) {
        continue;
      }
      paramList.add(new BasicNameValuePair(key, value));
    }
    // add key and value in loginPostData
    for (Entry<String,String> entry : loginPostData.entrySet()) {
      paramList.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
    }
    return paramList;
  }
}
Resources
Cookie Handling in Java SE 6
Apache HttpClient – Automate login Google

via Blogger http://ift.tt/1jB4FBu

Advertisements