知识大全 Java抓图程序的实现
Posted 知
篇首语:花门楼前见秋草,岂能贫贱相看老。本文由小常识网(cha138.com)小编为大家整理,主要介绍了知识大全 Java抓图程序的实现相关的知识,希望对你有一定的参考价值。
Java抓图程序的实现 以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!
主要难点:
并发线程的控制:采用了JDK自带的java.util.concurrent包
去重
序列化
运行方法:java -Xms<N>M -Xmx<N>M -jar JavaCrawler.jar C:/a.log D:/pic D:/url.tmp D:/img.tmp(堆大小的具体数值在转载时丢失;程序还需要一个起始URL参数,其值与位置同样已丢失,请对照main方法中args的使用顺序补全)
SimpleBloomFilter java
view plaincopy to clipboardprint?
········· ········ ········ ········ ········ ········ ········ ········ ········ ········ ······· ······· ······· ······· ······· package hengking crawl;
import java io Serializable;
import java util BitSet;
/**
 * A small Bloom filter for strings, used by the crawler to remember which
 * URLs have already been seen.
 *
 * <p>Each value is hashed once per seed and the resulting bit positions are
 * set in a shared {@link BitSet}. {@link #contains(String)} can therefore
 * report a value that was never added (a false positive), but never misses a
 * value that was added.
 *
 * <p>The class performs no synchronization of its own.
 *
 * <p>Usage:
 * <pre>
 *   SimpleBloomFilter filter = new SimpleBloomFilter();
 *   filter.contains(value); // false
 *   filter.add(value);
 *   filter.contains(value); // true
 * </pre>
 */
public class SimpleBloomFilter implements Serializable {

    // NOTE(review): every numeric literal of this listing was lost when the
    // article was republished. The values below are the ones found in the
    // widely circulated version of this filter; confirm them against the
    // original project, because changing any of them makes previously
    // serialized filters unreadable or meaningless.
    private static final long serialVersionUID = 1L;

    // Number of bits in the filter; must be a power of two because hash()
    // reduces with a bit mask (cap - 1).
    private final int DEFAULT_SIZE = 2 << 24;

    // One hash function is created per seed.
    private final int[] seeds = new int[] { 5, 7, 11, 13, 31, 37, 61 };

    private BitSet bits = new BitSet(DEFAULT_SIZE);

    private SimpleHash[] func = new SimpleHash[seeds.length];

    /** Creates an empty filter with one hash function per seed. */
    public SimpleBloomFilter() {
        for (int i = 0; i < seeds.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    /**
     * Records {@code value} in the filter.
     *
     * @param value the string to remember; must not be {@code null}
     * @throws NullPointerException if {@code value} is {@code null}
     */
    public void add(String value) {
        for (SimpleHash f : func) {
            bits.set(f.hash(value), true);
        }
    }

    /**
     * Tests whether {@code value} may have been added before.
     *
     * @param value the string to look up; {@code null} is never contained
     * @return {@code false} if the value was definitely never added,
     *         {@code true} if it was added or collides with added values
     */
    public boolean contains(String value) {
        if (value == null) {
            return false;
        }
        for (SimpleHash f : func) {
            // A single unset bit proves the value was never added.
            if (!bits.get(f.hash(value))) {
                return false;
            }
        }
        return true;
    }

    /** Seeded polynomial string hash reduced to the range {@code [0, cap)}. */
    public class SimpleHash implements Serializable {

        private int cap;

        private int seed;

        /**
         * @param cap  size of the target range; expected to be a power of two
         * @param seed multiplier used by the polynomial hash
         */
        public SimpleHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * Hashes {@code value} to a bit position.
         *
         * @param value the string to hash; must not be {@code null}
         * @return a non-negative index smaller than {@code cap}
         */
        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                // int overflow is intentional; the mask below keeps the result in range.
                result = seed * result + value.charAt(i);
            }
            return (cap - 1) & result;
        }

        @Override
        public String toString() {
            return super.toString();
        }
    }
}
package hengking crawl;
import java io Serializable;
import java util BitSet;
/**
 * A small Bloom filter for strings, used by the crawler to remember which
 * URLs have already been seen.
 *
 * <p>Each value is hashed once per seed and the resulting bit positions are
 * set in a shared {@link BitSet}. {@link #contains(String)} can therefore
 * report a value that was never added (a false positive), but never misses a
 * value that was added.
 *
 * <p>The class performs no synchronization of its own.
 *
 * <p>Usage:
 * <pre>
 *   SimpleBloomFilter filter = new SimpleBloomFilter();
 *   filter.contains(value); // false
 *   filter.add(value);
 *   filter.contains(value); // true
 * </pre>
 */
public class SimpleBloomFilter implements Serializable {

    // NOTE(review): every numeric literal of this listing was lost when the
    // article was republished. The values below are the ones found in the
    // widely circulated version of this filter; confirm them against the
    // original project, because changing any of them makes previously
    // serialized filters unreadable or meaningless.
    private static final long serialVersionUID = 1L;

    // Number of bits in the filter; must be a power of two because hash()
    // reduces with a bit mask (cap - 1).
    private final int DEFAULT_SIZE = 2 << 24;

    // One hash function is created per seed.
    private final int[] seeds = new int[] { 5, 7, 11, 13, 31, 37, 61 };

    private BitSet bits = new BitSet(DEFAULT_SIZE);

    private SimpleHash[] func = new SimpleHash[seeds.length];

    /** Creates an empty filter with one hash function per seed. */
    public SimpleBloomFilter() {
        for (int i = 0; i < seeds.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    /**
     * Records {@code value} in the filter.
     *
     * @param value the string to remember; must not be {@code null}
     * @throws NullPointerException if {@code value} is {@code null}
     */
    public void add(String value) {
        for (SimpleHash f : func) {
            bits.set(f.hash(value), true);
        }
    }

    /**
     * Tests whether {@code value} may have been added before.
     *
     * @param value the string to look up; {@code null} is never contained
     * @return {@code false} if the value was definitely never added,
     *         {@code true} if it was added or collides with added values
     */
    public boolean contains(String value) {
        if (value == null) {
            return false;
        }
        for (SimpleHash f : func) {
            // A single unset bit proves the value was never added.
            if (!bits.get(f.hash(value))) {
                return false;
            }
        }
        return true;
    }

    /** Seeded polynomial string hash reduced to the range {@code [0, cap)}. */
    public class SimpleHash implements Serializable {

        private int cap;

        private int seed;

        /**
         * @param cap  size of the target range; expected to be a power of two
         * @param seed multiplier used by the polynomial hash
         */
        public SimpleHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * Hashes {@code value} to a bit position.
         *
         * @param value the string to hash; must not be {@code null}
         * @return a non-negative index smaller than {@code cap}
         */
        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                // int overflow is intentional; the mask below keeps the result in range.
                result = seed * result + value.charAt(i);
            }
            return (cap - 1) & result;
        }

        @Override
        public String toString() {
            return super.toString();
        }
    }
}
UtilSeriz java
view plaincopy to clipboardprint?
········· ········ ········ ········ ········ ········ ········ ········ ········ ········ ······· ······· ······· ······· ······· package hengking crawl;
import java io *;
/**
 * Helpers that persist a single object to a file with Java serialization and
 * read it back.
 */
public class UtilSeriz {

    /**
     * Serializes an object to a file on disk, replacing any existing file.
     *
     * @param o       the object to write; must be serializable
     * @param strPath path of the target file
     * @throws Exception if the file cannot be written or {@code o} cannot be serialized
     */
    public static void writeObject(Object o, String strPath) throws Exception {
        File f = new File(strPath);
        if (f.exists()) {
            f.delete();
        }
        FileOutputStream os = new FileOutputStream(f);
        try {
            // ObjectOutputStream does the actual serialization.
            ObjectOutputStream oos = new ObjectOutputStream(os);
            oos.writeObject(o);
            oos.flush();
        } finally {
            // Closed even when serialization fails, so the file handle never leaks.
            os.close();
        }
    }

    /**
     * Deserializes the object stored in a file on disk.
     *
     * <p>NOTE(review): this uses native Java deserialization, which must only
     * be pointed at files the application wrote itself.
     *
     * @param strPath path of the file to read
     * @return the stored object, or {@code null} if the file does not exist
     * @throws Exception if the file cannot be read or does not contain a valid object
     */
    public static Object readObject(String strPath) throws Exception {
        File f = new File(strPath);
        if (!f.exists()) {
            return null;
        }
        InputStream is = new FileInputStream(f);
        try {
            // ObjectInputStream does the actual deserialization.
            ObjectInputStream ois = new ObjectInputStream(is);
            return ois.readObject();
        } finally {
            // The original never closed the stream, which kept the file open.
            is.close();
        }
    }
}
package hengking crawl;
import java io *;
/**
 * Helpers that persist a single object to a file with Java serialization and
 * read it back.
 */
public class UtilSeriz {

    /**
     * Serializes an object to a file on disk, replacing any existing file.
     *
     * @param o       the object to write; must be serializable
     * @param strPath path of the target file
     * @throws Exception if the file cannot be written or {@code o} cannot be serialized
     */
    public static void writeObject(Object o, String strPath) throws Exception {
        File f = new File(strPath);
        if (f.exists()) {
            f.delete();
        }
        FileOutputStream os = new FileOutputStream(f);
        try {
            // ObjectOutputStream does the actual serialization.
            ObjectOutputStream oos = new ObjectOutputStream(os);
            oos.writeObject(o);
            oos.flush();
        } finally {
            // Closed even when serialization fails, so the file handle never leaks.
            os.close();
        }
    }

    /**
     * Deserializes the object stored in a file on disk.
     *
     * <p>NOTE(review): this uses native Java deserialization, which must only
     * be pointed at files the application wrote itself.
     *
     * @param strPath path of the file to read
     * @return the stored object, or {@code null} if the file does not exist
     * @throws Exception if the file cannot be read or does not contain a valid object
     */
    public static Object readObject(String strPath) throws Exception {
        File f = new File(strPath);
        if (!f.exists()) {
            return null;
        }
        InputStream is = new FileInputStream(f);
        try {
            // ObjectInputStream does the actual deserialization.
            ObjectInputStream ois = new ObjectInputStream(is);
            return ois.readObject();
        } finally {
            // The original never closed the stream, which kept the file open.
            is.close();
        }
    }
}
SearchCrawler java
view plaincopy to clipboardprint?
········· ········ ········ ········ ········ ········ ········ ········ ········ ········ ······· ······· ······· ······· ······· package hengking crawl;
import java awt image BufferedImage;
import java io BufferedInputStream;
import java io BufferedReader;
import java io BufferedWriter;
import java io File;
import java io FileOutputStream;
import java io FileWriter;
import java io IOException;
import java io InputStreamReader;
import URL;
import java text SimpleDateFormat;
import java util ArrayList;
import java util Calendar;
import java util Date;
import java util HashMap;
import java util LinkedHashSet;
import ncurrent Callable;
import ncurrent ExecutorService;
import ncurrent Executors;
import ncurrent Semaphore;
import java util regex Matcher;
import java util regex Pattern;
import javax imageio ImageIO;
import hengking crawl po PoCalSearch;
import hengking crawl po PoDownload;
/***
* 说明:抓图工具
* @author 君望永远
*
*/
public class SearchCrawler implements Runnable
/* disallowListCache缓存robot不允许搜索的URL Robot协议在Web站点的根目录下设置一个robots txt文件
*规定站点上的哪些页面是限制搜索的 搜索程序应该在搜索过程中跳过这些区域 下面是robots txt的一个例子:
# robots txt for
User agent: *
Disallow: /cgi bin/
Disallow: /registration # /Disallow robots on registration page
Disallow: /login
*/
public static SimpleBloomFilter filterUrl;
public static SimpleBloomFilter filterImg;
private HashMap< String ArrayList< String>> disallowListCache = new HashMap< String ArrayList< String>>();
ArrayList< String> errorList= new ArrayList< String>();//错误信息
ArrayList< String> result=new ArrayList< String>(); //搜索到的结果
String startUrl;//开始搜索的起点
LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();
boolean caseSensitive=false;//是否区分大小写
boolean limitHost=false;//是否在限制的主机内搜索
private static String outdir;
private static String seroutdir;
private static String seroutdirimg;
private boolean blnFlag=false;
private static PoCalSearch ps=null;
private static PoDownload pd=null;
// 个图片分析线程
private static ExecutorService execImg;
final Semaphore sempImg = new Semaphore( );
// 个网页分析线程
private static ExecutorService execPage;
final Semaphore sempPage = new Semaphore( );
private ArrayList<ParsePage> arrPar=new ArrayList<ParsePage>();
//记录抓图结果
private static BufferedWriter bw = null;
public SearchCrawler(String startUrl)
this startUrl=startUrl;
public ArrayList< String> getResult()
return result;
public void run()//启动搜索线程
new Thread(new TimeWrite File()) start();
blnFlag=true;
crawl(startUrl limitHost caseSensitive);
//检测URL格式
private URL verifyUrl(String url)
// 只处理HTTP URLs
if (!url toLowerCase() startsWith(// ))
return null;
URL verifiedUrl = null;
try
verifiedUrl = new URL(url);
catch (Exception e)
return null;
return verifiedUrl;
// 检测robot是否允许访问给出的URL
private boolean isRobotAllowed(URL urlToCheck)
String host = urlToCheck getHost() toLowerCase();//获取给出RUL的主机
//System out println( 主机= +host);
// 获取主机不允许搜索的URL缓存
ArrayList< String> disallowList =disallowListCache get(host);
// 如果还没有缓存 下载并缓存
if (disallowList == null)
disallowList = new ArrayList< String>();
try
URL robotsFileUrl =new URL(// + host + /robots txt );
BufferedReader reader =new BufferedReader(new InputStreamReader(robotsFileUrl openStream()));
// 读robot文件 创建不允许访问的路径列表
String line;
while ((line = reader readLine()) != null)
if (line indexOf( Disallow: ) == ) //是否包含 Disallow:
String disallowPath =line substring( Disallow: length());//获取不允许访问路径
// 检查是否有注释
int mentIndex = disallowPath indexOf( # );
if (mentIndex != )
disallowPath =disallowPath substring( mentIndex);//去掉注释
disallowPath = disallowPath trim();
disallowList add(disallowPath);
// 缓存此主机不允许访问的路径
disallowListCache put(host disallowList);
catch (Exception e)
return true; //web站点根目录下没有robots txt文件 返回真
String file = urlToCheck getFile();
//System out println( 文件getFile()= +file);
for (int i = ; i < disallowList size(); i++)
String disallow = disallowList get(i);
if (file startsWith(disallow))
return false;
return true;
private String downloadPage(URL pageUrl)
try
// Open connection to URL for reading
BufferedReader reader =
new BufferedReader(new InputStreamReader(pageUrl openStream()));
// Read page into buffer
String line;
StringBuffer pageBuffer = new StringBuffer();
while ((line = reader readLine()) != null)
pageBuffer append(line);
return pageBuffer toString();
catch (Exception e)
e printStackTrace();
return null;
// 从URL中去掉
private String removeWwwFromUrl(String url)
int index = url indexOf( :// );
if (index != )
return url substring( index + ) +
url substring(index + );
return (url);
// 解析页面并找出链接
private ArrayList< String> retrieveLinks(URL pageUrl String pageContents
boolean limitHost)
// 用正则表达式编译链接的匹配模式
Pattern p =pile( <a\\\\s+href\\\\s*=\\\\s*\\ ?( *?)[\\ |>] Pattern CASE_INSENSITIVE);
Matcher m = p matcher(pageContents);
ArrayList< String> linkList = new ArrayList< String>();
while (m find())
String link = m group( ) trim();
if (link length() < )
continue;
// 跳过链到本页面内链接
if (link charAt( ) == # )
continue;
if (link indexOf( mailto: ) != )
continue;
if (link toLowerCase() indexOf( javascript ) != )
continue;
if (link indexOf( :// ) == )
if (link charAt( ) == / ) //处理绝对地
link = // + pageUrl getHost()+ : +pageUrl getPort()+ link;
else
String file = pageUrl getFile();
if (file indexOf( / ) == ) //处理相对地址
link = // + pageUrl getHost()+ : +pageUrl getPort() + / + link;
else
String path =file substring( file lastIndexOf( / ) + );
link = // + pageUrl getHost() + : +pageUrl getPort()+ path + link;
int index = link indexOf( # );
if (index != )
link = link substring( index);
link = removeWwwFromUrl(link);
URL verifiedLink = verifyUrl(link);
if (verifiedLink == null)
continue;
/* 如果限定主机 排除那些不合条件的URL*/
if (limitHost &&
!pageUrl getHost() toLowerCase() equals(
verifiedLink getHost() toLowerCase()))
continue;
// 跳过那些已经处理的链接
if(ntains(link))
logEvent( 匹配了: +link);
continue;
else
filterUrl add(link);
linkList add(link);
return (linkList);
// 解析页面并找出链接
private ArrayList< String> retrieveImgLinks(URL pageUrl String pageContents
boolean limitHost)
// 用正则表达式编译链接的匹配模式
Pattern p =pile( <img\\\\s+src\\\\s*=\\\\s*\\ ?( *?)[\\ |>] Pattern CASE_INSENSITIVE);
Matcher m = p matcher(pageContents);
ArrayList< String> linkList = new ArrayList< String>();
while (m find())
String link = m group( ) trim();
if (link length() < )
continue;
// 跳过链到本页面内链接
if (link charAt( ) == # )
continue;
if (link indexOf( mailto: ) != )
continue;
if (link toLowerCase() indexOf( javascript ) != )
continue;
if (link toLowerCase() endsWith( gif ))
continue;
if (link indexOf( :// ) == )
if (link charAt( ) == / )
//处理绝对地
link = // + pageUrl getHost()+ : +pageUrl getPort()+ link;
else
String file = pageUrl getFile();
if (file indexOf( / ) == ) //处理相对地址
link = // + pageUrl getHost()+ : +pageUrl getPort() + / + link;
else
String path =file substring( file lastIndexOf( / ) + );
link = // + pageUrl getHost() + : +pageUrl getPort()+ path + link;
int index = link indexOf( # );
if (index != )
link = link substring( index);
link = removeWwwFromUrl(link);
URL verifiedLink = verifyUrl(link);
if (verifiedLink == null)
continue;
/* 如果限定主机 排除那些不合条件的URL*/
if (limitHost &&
!pageUrl getHost() toLowerCase() equals(
verifiedLink getHost() toLowerCase()))
continue;
// 跳过那些已经处理的链接
// if (ntains(link))
// continue;
//
if(ntains(link))
logEvent( 图片匹配了: +link);
continue;
else
filterImg add(link);
if(link lastIndexOf( gif )== )
linkList add(link);
return (linkList);
//执行实际的搜索操作
public ArrayList< String> crawl(String startUrl boolean limithost boolean caseSensitive )
// 从开始URL中移出
startUrl = removeWwwFromUrl(startUrl);
toCrawlList add(startUrl);
int idxPageParse= ;
while (toCrawlList size()> )
try
idxPageParse++;
// Get URL at bottom of the list
String url = erator() next();
ps setIntUrl(ps getIntUrl()+ );
// Remove URL from the to crawl list
toCrawlList remove(url);
int intRetryPage= ;
while (sempPage availablePermits()<= )
System out println( 暂时没有空闲的网页分析线程 等待 秒再执行 );
try
intRetryPage++;
if(intRetryPage== )
logEvent( 分析网页 +url+ 超时 );
sempPage release();
break;
Thread sleep( );
catch (InterruptedException e)
e printStackTrace();
ParsePage tempPageThread=new ParsePage(url);
execPage submit(tempPageThread);
logEvent( 开启网页分析线程 +idxPageParse);
if(idxPageParse== )
Thread currentThread() sleep( );
catch(Exception e)
e printStackTrace();
blnFlag=false;
logEvent( 抓图完成 );
return result;
public static void logEvent(String strLog)
System out println( new SimpleDateFormat( yyyy年MM月dd日HH时mm分ss秒 ) format(new Date(Calendar getInstance() getTimeInMillis()))+ =====> +strLog);
// 主函数
public static void main(String[] args)
if(args length!= )
System out println( Usage:java SearchCrawler startUrl maxUrl searchString );
return;
@SuppressWarnings( unused )
String strLogPath=args[ ];
SearchCrawler crawler = new SearchCrawler(args[ ]);
outdir=args[ ]+ /pic +new SimpleDateFormat( yyyyMMdd ) format(new Date(Calendar getInstance() getTimeInMillis()))+ / ;
File f=new File(outdir);
if(!f exists())
f mkdir();
execPage = Executors newFixedThreadPool( );
execImg = Executors newFixedThreadPool( );
seroutdir=args[ ];
seroutdirimg=args[ ];
ps=new PoCalSearch();
pd=new PoDownload();
try
if(UtilSeriz readObject(seroutdir)!=null)
System out println(new SimpleDateFormat( yyyy年MM月dd日HH时mm分ss秒 ) format(new Date(Calendar getInstance() getTimeInMillis()))+ =====> + 反序列化URL );
filterUrl=(SimpleBloomFilter)UtilSeriz readObject(seroutdir);
else
filterUrl=new SimpleBloomFilter();
if(UtilSeriz readObject(seroutdir)!=null)
System out println(new SimpleDateFormat( yyyy年MM月dd日HH时mm分ss秒 ) format(new Date(Calendar getInstance() getTimeInMillis()))+ =====> + 反序列化图片 );
filterImg=(SimpleBloomFilter)UtilSeriz readObject(seroutdirimg);
else
filterImg=new SimpleBloomFilter();
catch (Exception e)
e printStackTrace();
String strPic=args[ ]+ /pic +new SimpleDateFormat( yyyyMMdd ) format(new Date(Calendar getInstance() getTimeInMillis()))+ log ;
try
bw=new BufferedWriter(new FileWriter(strPic false));
catch (IOException e)
// TODO Auto generated catch block
e printStackTrace();
Thread search=new Thread(crawler);
System out println( new SimpleDateFormat( yyyy年MM月dd日HH时mm分ss秒 ) format(new Date(Calendar getInstance() getTimeInMillis()))+ =====> + 开始爬图 );
System out println( 下载了图: );
search start();
try
search join();
logEvent( 主函数结束 );
bw close();
catch (Exception e)
// TODO Auto generated catch block
e printStackTrace();
/**
* 说明:下载图片的线程
* @author binbin
*
*/
public class ImgDownThread implements Runnable Callable<Long>
//待下载的URL
private String stru;
private boolean isStart=true;
public ImgDownThread(String strurl)
super();
this stru = strurl;
@Override
public void run()
try
sempImg acquire();
try
URL url=new URL(stru);
BufferedInputStream in = new BufferedInputStream(url openStream());
BufferedImage bi=ImageIO read(url openStream());
//尺寸要求
if (bi==null|| bi getWidth()< || bi getHeight()< )
in close();
return;
String ss=new SimpleDateFormat( yyyyMMddHHmmss ) format(new Date(Calendar getInstance() getTimeInMillis()))+ _ +Math round(Math random()* L+ )+stru substring(stru lastIndexOf( ));
String s=outdir+ss;
FileOutputStream file = new FileOutputStream(new File(s));
int t;
while ((t = in read()) != )
file write(t);
file close();
if(new File(s) length()<= * )
in close();
new File(s) delete();
return;
synchronized(bw)
String str=ss+ : +stru;
bw write(str);
bw newLine();
bw flush();
logEvent( 下载了: +stru);
ps setIntImg(ps getIntImg()+ );
in close();
catch(Exception e)
logEvent( **********************下载图片: +stru+ 超时 );
catch (Exception e)
e printStackTrace();
finally
sempImg release();
public boolean isStart()
return isStart;
public void setStart(boolean isStart)
this isStart = isStart;
@Override
public Long call() throws Exception
try
sempImg acquire();
try
URL url=new URL(stru);
BufferedInputStream in = new BufferedInputStream(url openStream());
BufferedImage bi=ImageIO read(url openStream());
//尺寸要求
if (bi==null|| bi getWidth()< || bi getHeight()< )
in close();
return l;
String ss=new SimpleDateFormat( yyyyMMddHHmmss ) format(new Date(Calendar getInstance() getTimeInMillis()))+ _ +Math round(Math random()* L+ )+stru substring(stru lastIndexOf( ));
String s=outdir+ss;
FileOutputStream file = new FileOutputStream(new File(s));
int t;
while ((t = in read()) != )
file write(t);
file close();
if(new File(s) length()<= * )
in close();
new File(s) delete();
return l;
logEvent( 下载了: +stru);
ps setIntImg(ps getIntImg()+ );
in close();
catch(Exception e)
logEvent( **********************下载图片: +stru+ 超时 );
catch (Exception e)
e printStackTrace();
finally
sempImg release();
return l;
/***
* 序列化已访问的URL
* @author binbin
*
*/
public class TimeWrite File implements Runnable
@Override
public void run()
while(blnFlag)
try
synchronized(ps)
logEvent( 开始序列化URL );
UtilSeriz writeObject(filterUrl seroutdir);
logEvent( 结束序列化URL );
logEvent( 开始序列化图片 );
UtilSeriz writeObject(filterImg seroutdirimg);
logEvent( 结束序列化图片 );
logEvent( 分析了 +ps getIntUrl()+ 个链接 );
logEvent( 下载了 +ps getIntImg()+ 张图片 );
Thread sleep( );
catch (Exception e)
相关参考
Delphi屏幕抓图技术的实现 以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!摘要本文以Delphi
基于Delphi的屏幕抓图技术的实现 以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!
知识大全 Java程序性能优化--让你的Java程序更快、更稳定
Java程序性能优化--让你的Java程序更快、更稳定 以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧
Java应用程序和小应用程序 以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧! Java程序可以是独
Java程序性能优化-看懂程序的性能 以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!
如何优化JAVA程序开发,提高JAVA性能 以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧! 通过使
Java程序编码规范与技巧 以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧! java程序编码规范
Java进阶:Java编写通过代理访问的应用程序 以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!
如何保护Java程序 以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧! Java是一种跨平台的解释型
Java程序里的内存泄漏 以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧! Java程序里的内存泄漏