手工编写爬虫

1. 纯Java版本

import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;

/**
 * Minimal crawler: fetches a page over HTTPS with a spoofed User-Agent
 * and prints the response body (lines concatenated, newlines dropped).
 */
public class Exec {
	public static void main(String[] args) {
		try {
			URL url = new URL("https://www.baidu.com/");
			URLConnection urlCon = url.openConnection();
			// NOTE: the original called setDoOutput(true) here. For a plain
			// fetch that is wrong — on HttpURLConnection it silently turns
			// the request into a POST — so it has been removed.
			urlCon.setReadTimeout(10000);    // fail if no data within 10 s
			urlCon.setConnectTimeout(10000); // fail if no connection within 10 s
			// FIX: header name must be exactly "User-Agent"; the original used
			// "User-AgentOne", which servers ignore, so the spoofed UA below
			// was never actually applied.
			urlCon.setRequestProperty(
					"User-Agent",
					"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 4.0; Trident/5.0; .NET CLR 3.5.12556)");
			StringBuilder sb = new StringBuilder();
			// try-with-resources closes the Scanner (and the underlying
			// stream) even on error — the original leaked it.
			try (Scanner in = new Scanner(urlCon.getInputStream(), "UTF-8")) {
				while (in.hasNextLine()) {
					sb.append(in.nextLine());
				}
			}
			System.out.println(sb.toString());
		} catch (Exception e) {
			// Best-effort demo: report and exit rather than crash.
			System.out.println(e.getLocalizedMessage());
		}
	}
}

2. 增加伪造请求头和休眠周期

import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;

/**
 * Crawler variant that rotates among four fake User-Agent strings at
 * random and sleeps a random interval after the fetch, to look less
 * like an automated client.
 */
public class Exec {
	public static void main(String[] args) {
		try {
			URL url = new URL("https://www.baidu.com/");
			URLConnection urlCon = url.openConnection();
			// NOTE: the original called setDoOutput(true) here. For a plain
			// fetch that is wrong — on HttpURLConnection it silently turns
			// the request into a POST — so it has been removed.
			urlCon.setReadTimeout(10000);    // fail if no data within 10 s
			urlCon.setConnectTimeout(10000); // fail if no connection within 10 s
			// Pick one of four fake User-Agent strings with weights
			// 3/10, 2/10, 2/10, 3/10 (same distribution as the original,
			// which crammed the first three branches onto one line).
			int rand = (int) (Math.random() * 10);
			String userAgent;
			if (rand < 3) {
				userAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 4.0; Trident/5.0; .NET CLR 3.5.12556)";
			} else if (rand < 5) {
				userAgent = "Mozilla/4.5 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; .NET CLR 2.0.50727)";
			} else if (rand < 7) {
				userAgent = "Mozilla/5.1 (compatible; MSIE 9.0; Mac OS; Trident/4.0; .NET CLR 5.0.66711)";
			} else {
				userAgent = "Mozilla/6.0 (compatible; MSIE 8.1; Windows NT 7.1; Trident/6.0; .NET CLR 3.5.55992)";
			}
			urlCon.setRequestProperty("User-Agent", userAgent);
			StringBuilder sb = new StringBuilder();
			// try-with-resources closes the Scanner (and the underlying
			// stream) even on error — the original leaked it.
			try (Scanner in = new Scanner(urlCon.getInputStream(), "UTF-8")) {
				while (in.hasNextLine()) {
					sb.append(in.nextLine());
				}
			}
			System.out.println(sb.toString());
			// The original guarded this with a local `flag` that was always
			// false, making its 0-10 s else-branch unreachable dead code;
			// only the live branch (0-40 s random pause) is kept.
			Thread.sleep(1000 * (int) (Math.random() * 40));
		} catch (Exception e) {
			// Best-effort demo: report and exit rather than crash.
			System.out.println(e.getLocalizedMessage());
		}
	}
}

3. 基于HC的爬虫

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * Crawler built on Apache HttpComponents (HttpClient 4.x legacy API):
 * issues a GET, reads the body as UTF-8, and prints it with CRLF line
 * terminators.
 *
 * NOTE(review): DefaultHttpClient is deprecated in HttpClient 4.3+ in
 * favor of HttpClients.createDefault(); kept here to match the article's
 * bundled jars.
 */
public class Exec {
	public static void main(String[] args) {
		HttpClient httpclient = new DefaultHttpClient();
		String startUrl = "https://www.baidu.com/";
		try {
			HttpGet httpget = new HttpGet(startUrl);
			HttpResponse response = httpclient.execute(httpget);
			HttpEntity entity = response.getEntity();
			if (entity == null) {
				System.out.println("Error" + httpget.getURI());
			} else {
				try {
					// try-with-resources closes the reader AND the entity
					// stream it wraps — the original only closed the raw
					// InputStream, leaking the BufferedReader.
					try (BufferedReader br = new BufferedReader(
							new InputStreamReader(entity.getContent(), "UTF-8"))) {
						StringBuilder sb = new StringBuilder();
						String line;
						while ((line = br.readLine()) != null) {
							// "\r\n" replaces the original's obscure
							// (char) 13 + (char) 10 — same bytes.
							sb.append(line).append("\r\n");
						}
						System.out.println(sb);
					}
				} catch (Exception e) {
					// Abort so the connection is not left half-read.
					httpget.abort();
				}
				System.out.println("Success" + httpget.getURI());
			}
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
		} finally {
			// Release all pooled connections before exiting.
			httpclient.getConnectionManager().shutdown();
		}
	}
}

所需的 Jar 包下载:lib(原文附件链接)

发表评论

邮箱地址不会被公开。 必填项已用*标注