網路爬蟲 (簡易版)
// Simple web crawler: harvests SourceForge project URLs from the trove list
// and dumps the sorted, de-duplicated URL set to a file.
package ccc;

import java.util.regex.*;
import java.util.*;
import java.io.*;
import java.net.*;

public class Spider {

    // Visited-URL set (key == value == the URL). TreeMap keeps the final dump
    // sorted. Raw type kept intentionally: the file is pre-generics style and
    // NET.html2urls returns a raw Vector.
    TreeMap urlMap = new TreeMap();

    /**
     * Crawls all 585 pages of the SourceForge software-map trove list
     * (category 198) at depth 1, collecting project URLs under
     * http://sourceforge.net/projects/, then writes the sorted URL list
     * to spider\sourceforge.lst.
     *
     * @throws Exception propagated from network or file I/O helpers
     */
    public static void main(String[] args) throws Exception {
        Spider spider = new Spider();
        NET.setProxy("proxy.internal", "3128");
        // 585 = number of listing pages in the trove category at crawl time.
        for (int i = 1; i <= 585; i++) {
            String url = "http://sourceforge.net/softwaremap/trove_list.php?form_cat=198&page=" + i;
            spider.craw(url, "http://sourceforge.net/projects/", 1);
        }
        System.out.println();
        String urlList = UTIL.array2text(spider.urlMap.keySet().toArray(), "\n");
        STR.text2file(urlList, "spider\\sourceforge.lst");
    }

    /**
     * Heuristic filter: returns true when the URL likely points to an
     * HTML/text page rather than a binary resource (image, archive, ...).
     *
     * @param pUrl the absolute URL to classify
     * @return true if the URL looks like a crawlable text page
     */
    boolean isTextUrl(String pUrl) {
        String lUrl = pUrl.toLowerCase();
        // Last path segment; presumably STR.last returns the substring after
        // the final "/" -- TODO confirm against STR's implementation.
        String last = STR.last(lUrl, "/");
        if (last.indexOf(".") < 0) return true;  // no extension: assume a page
        if (lUrl.indexOf("?") > 0) return true;  // query string: dynamic page
        if (lUrl.indexOf(".xml") > 0) return true;
        if (lUrl.indexOf(".htm") > 0) return true;  // also matches .html
        if (lUrl.indexOf(".jsp") > 0) return true;
        if (lUrl.indexOf(".cgi") > 0) return true;
        if (lUrl.indexOf(".asp") > 0) return true;
        if (lUrl.indexOf(".php") > 0) return true;
        return false;
    }

    /**
     * Depth-limited recursive crawl: fetches pUrl, extracts its links, and
     * follows every not-yet-visited text URL that starts with pDomain.
     *
     * @param pUrl    page to fetch
     * @param pDomain URL prefix a link must have to be followed
     * @param pDepth  remaining recursion depth; 0 or less stops the crawl
     * @throws Exception propagated from the NET helpers
     */
    void craw(String pUrl, String pDomain, int pDepth) throws Exception {
        if (pDepth <= 0) return;
        String text = NET.url2text(pUrl);
        Vector textUrls = NET.html2urls(text);
        System.out.println("craw url=" + pUrl);
        // BUG FIX: NET.redirectUrl is shared static state (presumably set by
        // NET.url2text to the post-redirect URL of the page just fetched --
        // confirm against NET). The recursive craw() call below overwrites it,
        // so reading it inside the loop resolved later relative links against
        // the WRONG base page. Capture this page's base URL once, up front.
        String baseUrl = NET.redirectUrl;
        if (!baseUrl.equals(pUrl)) System.out.println("->redirectUrl = " + baseUrl);
        for (int ui = 0; ui < textUrls.size(); ui++) {
            String fullUrl = NET.fullUrl(baseUrl, textUrls.get(ui).toString());
            if (fullUrl == null) continue;
            fullUrl = STR.noLast(fullUrl, "#");  // drop the fragment part
            if (!isTextUrl(fullUrl)) continue;
            if (fullUrl.startsWith(pDomain)) {
                if (urlMap.get(fullUrl) == null) {  // not visited yet
                    urlMap.put(fullUrl, fullUrl);
                    craw(fullUrl, pDomain, pDepth - 1);
                }
            }
        }
    }
}
沒有留言:
張貼留言