1. 纯Java版本
import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;

/**
 * Minimal page fetcher that uses only the JDK: opens a connection to a fixed
 * URL, reads the response as UTF-8 text and prints it to standard output.
 */
public class Exec {

    /**
     * Fetches {@code https://www.baidu.com/} and prints the body.
     *
     * <p>Any failure (bad URL, connect/read timeout, I/O error) is reported by
     * printing the exception's localized message to standard output; nothing
     * is rethrown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            URL url = new URL("https://www.baidu.com/");
            URLConnection url_con = url.openConnection();
            // No request body is ever written, so output mode is not enabled
            // on the connection.
            url_con.setReadTimeout(10000);    // milliseconds
            url_con.setConnectTimeout(10000); // milliseconds
            // NOTE(review): the header name is "User-AgentOne", not
            // "User-Agent" - confirm whether that is intentional.
            url_con.setRequestProperty(
                    "User-AgentOne",
                    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 4.0; Trident/5.0; .NET CLR 3.5.12556)");

            StringBuilder sb = new StringBuilder();
            // Closing the Scanner also closes the connection's input stream,
            // so the stream is released even if reading fails part-way.
            try (Scanner in = new Scanner(url_con.getInputStream(), "UTF-8")) {
                while (in.hasNextLine()) {
                    // Lines are concatenated without their terminators, so the
                    // printed page is one long line.
                    sb.append(in.nextLine());
                }
            }
            System.out.println(sb.toString());
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
        }
    }
}
2. 增加伪造请求头和休眠周期
import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;

/**
 * Page fetcher that sends a randomly chosen User-Agent header and pauses for
 * a random interval after the download.
 */
public class Exec {

    /**
     * Fetches {@code https://www.baidu.com/} with one of four User-Agent
     * strings, prints the body, then sleeps for a random 0-39 seconds.
     *
     * <p>Failures are reported by printing the exception's localized message
     * to standard output; nothing is rethrown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            URL url = new URL("https://www.baidu.com/");
            URLConnection url_con = url.openConnection();
            // No request body is ever written, so output mode is not enabled
            // on the connection.
            url_con.setReadTimeout(10000);    // milliseconds
            url_con.setConnectTimeout(10000); // milliseconds

            int randd = (int) (Math.random() * 10); // uniform in 0..9
            url_con.setRequestProperty("User-Agent", pickUserAgent(randd));

            StringBuilder sb = new StringBuilder();
            // Closing the Scanner also closes the connection's input stream,
            // so the stream is released even if reading fails part-way.
            try (Scanner in = new Scanner(url_con.getInputStream(), "UTF-8")) {
                while (in.hasNextLine()) {
                    // Lines are concatenated without their terminators.
                    sb.append(in.nextLine());
                }
            }
            System.out.println(sb.toString());

            // Random pause of 0-39 whole seconds. A shorter 0-9 second pause
            // used to sit behind a flag that was never set to true, so this
            // was the only branch that could ever run.
            Thread.sleep(1000 * (int) (Math.random() * 40));
        } catch (InterruptedException e) {
            // Restore the interrupt status so the caller/runtime can see it.
            Thread.currentThread().interrupt();
            System.out.println(e.getLocalizedMessage());
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
        }
    }

    /**
     * Maps a bucket number to a User-Agent string: values below 3 pick the
     * first string, 3-4 the second, 5-6 the third, anything else the fourth.
     */
    private static String pickUserAgent(int bucket) {
        if (bucket < 3) {
            return "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 4.0; Trident/5.0; .NET CLR 3.5.12556)";
        }
        if (bucket < 5) {
            return "Mozilla/4.5 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; .NET CLR 2.0.50727)";
        }
        if (bucket < 7) {
            return "Mozilla/5.1 (compatible; MSIE 9.0; Mac OS; Trident/4.0; .NET CLR 5.0.66711)";
        }
        return "Mozilla/6.0 (compatible; MSIE 8.1; Windows NT 7.1; Trident/6.0; .NET CLR 3.5.55992)";
    }
}
3. 基于HC(Apache HttpClient)的爬虫
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * Page fetcher built on Apache HttpClient: issues a GET for a fixed URL,
 * prints the response body (decoded as UTF-8, lines terminated with CR LF)
 * and then reports success or failure for that URL.
 */
public class Exec {

    /**
     * Fetches {@code https://www.baidu.com/} and prints the body.
     *
     * <p>Prints {@code "Success" + uri} only when the whole body was read;
     * prints {@code "Error" + uri} when the response has no entity or reading
     * the body fails. Failures while executing the request are reported by
     * printing the exception's localized message. The client's connection
     * manager is shut down in every case.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        HttpClient httpclient = new DefaultHttpClient();
        String startUrl = "https://www.baidu.com/";
        try {
            HttpGet httpget = new HttpGet(startUrl);
            HttpResponse response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                System.out.println("Error" + httpget.getURI());
            } else {
                InputStream is = entity.getContent();
                BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
                boolean readOk = false;
                try {
                    StringBuilder sb = new StringBuilder();
                    String str = br.readLine();
                    while (str != null) {
                        // Re-attach an explicit CR LF after every line.
                        sb.append(str).append((char) 13).append((char) 10);
                        str = br.readLine();
                    }
                    System.out.println(sb);
                    readOk = true;
                } catch (Exception e) {
                    // Abort before the stream is closed in the finally block,
                    // then report the failure instead of dropping it.
                    httpget.abort();
                    System.out.println("Error" + httpget.getURI() + ": " + e.getLocalizedMessage());
                } finally {
                    try {
                        // Closing the reader closes the underlying content stream.
                        br.close();
                    } catch (Exception ignored) {
                        // Best-effort close: nothing useful can be done here.
                    }
                }
                // Only claim success when the body was actually read in full.
                if (readOk) {
                    System.out.println("Success" + httpget.getURI());
                }
            }
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
        } finally {
            httpclient.getConnectionManager().shutdown();
        }
    }
}
所需的Jar包下载:lib