知识大全 使用java将网页保存为mht格式

Posted 类型

篇首语:新长征路上,有风有雨是常态,风雨无阻是心态,风雨兼程是状态。本文由小常识网(cha138.com)小编为大家整理,主要介绍了知识大全 使用java将网页保存为mht格式相关的知识,希望对你有一定的参考价值。

  package tag;

  import java.io.BufferedInputStream;
  import java.io.BufferedOutputStream;
  import java.io.BufferedReader;
  import java.io.ByteArrayInputStream;
  import java.io.DataOutputStream;
  import java.io.File;
  import java.io.FileInputStream;
  import java.io.FileOutputStream;
  import java.io.FileWriter;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.InputStreamReader;
  import java.io.OutputStream;
  import java.io.Reader;
  import java.net.MalformedURLException;
  import java.net.URL;
  import java.util.*;

  import javax.activation.DataHandler;
  import javax.activation.DataSource;
  import javax.activation.MimetypesFileTypeMap;
  import javax.mail.Message;
  import javax.mail.MessagingException;
  import javax.mail.Multipart;
  import javax.mail.Session;
  import javax.mail.internet.InternetAddress;
  import javax.mail.internet.MimeBodyPart;
  import javax.mail.internet.MimeMessage;
  import javax.mail.internet.MimeMultipart;
  import javax.mail.internet.MimePartDataSource;

  // NOTE(review): the "org.htmlparser" and "com.toptrack" package prefixes were
  // stripped from this copy of the source and are reconstructed here - confirm
  // them against the jars on the build path.
  import org.htmlparser.Parser;
  import org.htmlparser.Tag;
  import org.htmlparser.filters.TagNameFilter;
  import org.htmlparser.lexer.Lexer;
  import org.htmlparser.lexer.Page;
  import org.htmlparser.util.DefaultParserFeedback;
  import org.htmlparser.util.NodeList;
  import org.htmlparser.util.ParserException;

  import com.toptrack.tools.JQuery;

  /**

  * mht文件解析类

  * @author dl

  */

  public class Html MHTCompiler

  private URL strWeb = null; /**网页地址*/

  private String strText = null; /**网页文本内容*/

  private String strFileName = null; /**本地文件名*/

  private String strEncoding = null; /**网页编码*/

  //mht格式附加信息

  private String from = ;

  private String to;

  private String subject = mht pile ;

  private String cc;

  private String bcc;

  private String smtp = localhost ;

  public static void main(String[] args)

  String strUrl = ;

  String strEncoding = utf ;

  String strText = JQuery getHtmlText(strUrl strEncoding null);

  if (strText == null)

  return;

  Html MHTCompiler h t = new Html MHTCompiler(strText strUrl strEncoding test mht );

  pile();

  //Html MHTCompiler mht ( test mht l );

  

  /**

  *<br>方法说明 初始化

  *<br>输入参数 strText 网页文本内容; strUrl 网页地址; strEncoding 网页编码; strFileName 本地文件名

  *<br>返回类型

  */

  public Html MHTCompiler(String strText String strUrl String strEncoding String strFileName)

  // TODO Auto generated constructor stub

  try

  strWeb = new URL(strUrl);

   catch (MalformedURLException e)

  // TODO Auto generated catch block

  e printStackTrace();

  return;

  

  this strText = strText;

  this strEncoding = strEncoding;

  this strFileName = strFileName;

  

  /**

  *<br>方法说明 执行下载操作

  *<br>输入参数

  *<br>返回类型

  */

  public boolean pile()

  if (strWeb == null || strText == null || strFileName == null || strEncoding == null)

  return false;

  HashMap urlMap = new HashMap();

  NodeList nodes = new NodeList();

  try

  Parser parser = createParser(strText);

  parser setEncoding(strEncoding);

  nodes = parser parse(null);

   catch (ParserException e)

  // TODO Auto generated catch block

  e printStackTrace();

  

  extractAllScriptNodes(nodes);

  ArrayList urlScriptList = extractAllScriptNodes(nodes urlMap);

  ArrayList urlImageList = extractAllImageNodes(nodes urlMap);

  for (Iterator iter = urlMap entrySet(erator(); iter hasNext();)

  Map Entry entry = (Map Entry) iter next();

  String key = (String)entry getKey();

  String val = (String)entry getValue();

  strText = JHtmlClear replace(strText val key);

  

  try

  createMhtArchive(strText urlScriptList urlImageList);

   catch (Exception e)

  // TODO Auto generated catch block

  e printStackTrace();

  return false;

  

  return true;

  

  /**

  *<br>方法说明 建立HTML parser

  *<br>输入参数 inputHTML 网页文本内容

  *<br>返回类型 HTML parser

  */

  private Parser createParser(String inputHTML)

  // TODO Auto generated method stub

  Lexer mLexer = new Lexer(new Page(inputHTML));

  return new Parser(mLexer new DefaultParserFeedback(DefaultParserFeedback QUIET));

  

  /**

  *<br>方法说明 抽取基础URL地址

  *<br>输入参数 nodes 网页标签集合

  *<br>返回类型

  */

  private void extractAllScriptNodes(NodeList nodes)

  NodeList filtered = nodes extractAllNodesThatMatch(new TagNameFilter(

   BASE ) true);

  if (filtered != null && filtered size() > )

  Tag tag = (Tag) filtered elementAt( );

  String href = tag getAttribute( href );

  if (href != null && href length() > )

  try

  strWeb = new URL(href);

   catch (MalformedURLException e)

  // TODO Auto generated catch block

  e printStackTrace();

  

  

  

  

  /**

  *<br>方法说明 抽取网页包含的css js链接

  *<br>输入参数 nodes 网页标签集合; urlMap 已存在的url集合

  *<br>返回类型 css js链接的集合

  */

  private ArrayList extractAllScriptNodes(NodeList nodes HashMap urlMap)

  ArrayList urlList = new ArrayList();

  NodeList filtered = nodes extractAllNodesThatMatch(new TagNameFilter( script ) true);

  for (int i = ; i < filtered size(); i++)

  Tag tag = (Tag) filtered elementAt(i);

  String src = tag getAttribute( src );

  // Handle external css file s url

  if (src != null && src length() > )

  String innerURL = src;

  String absoluteURL = makeAbsoluteURL(strWeb innerURL);

  if (absoluteURL != null && !ntainsKey(absoluteURL))

  urlMap put(absoluteURL innerURL);

  ArrayList urlInfo = new ArrayList();

  urlInfo add(innerURL);

  urlInfo add(absoluteURL);

  urlList add(urlInfo);

  

  tag setAttribute( src absoluteURL);

  

  

  filtered = nodes extractAllNodesThatMatch(new TagNameFilter( link ) true);

  for (int i = ; i < filtered size(); i++)

  Tag tag = (Tag) filtered elementAt(i);

  String type = (tag getAttribute( type ));

  String rel = (tag getAttribute( rel ));

  String href = tag getAttribute( href );

  boolean isCssFile = false;

  if (rel != null)

  isCssFile = rel indexOf( stylesheet ) != ;

   else if (type != null)

  isCssFile |= type indexOf( text/css ) != ;

  

  // Handle external css file s url

  if (isCssFile && href != null && href length() > )

  String innerURL = href;

  String absoluteURL = makeAbsoluteURL(strWeb innerURL);

  if (absoluteURL != null && !ntainsKey(absoluteURL))

  urlMap put(absoluteURL innerURL);

  ArrayList urlInfo = new ArrayList();

  urlInfo add(innerURL);

  urlInfo add(absoluteURL);

  urlList add(urlInfo);

  

  tag setAttribute( href absoluteURL);

  

  

  return urlList;

  

  /**

  *<br>方法说明 抽取网页包含的图像链接

  *<br>输入参数 nodes 网页标签集合; urlMap 已存在的url集合

  *<br>返回类型 图像链接集合

  */

  private ArrayList extractAllImageNodes(NodeList nodes HashMap urlMap)

  ArrayList urlList = new ArrayList();

  NodeList filtered = nodes extractAllNodesThatMatch(new TagNameFilter( IMG ) true);

  for (int i = ; i < filtered size(); i++)

  Tag tag = (Tag) filtered elementAt(i);

  String src = tag getAttribute( src );

  // Handle external css file s url

  if (src != null && src length() > )

  String innerURL = src;

  String absoluteURL = makeAbsoluteURL(strWeb innerURL);

  if (absoluteURL != null && !ntainsKey(absoluteURL))

  urlMap put(absoluteURL innerURL);

  ArrayList urlInfo = new ArrayList();

  urlInfo add(innerURL);

  urlInfo add(absoluteURL);

  urlList add(urlInfo);

  

  tag setAttribute( src absoluteURL);

  

  

  return urlList;

  

  /**

  *<br>方法说明 相对路径转绝对路径

  *<br>输入参数 strWeb 网页地址; innerURL 相对路径链接

  *<br>返回类型 绝对路径链接

  */

  public static String makeAbsoluteURL(URL strWeb String innerURL)

  // TODO Auto generated method stub

  //去除后缀

  int pos = innerURL indexOf( ? );

  if (pos != )

  innerURL = innerURL substring( pos);

  

  if (innerURL != null

  && innerURL toLowerCase() indexOf( ) == )

  System out println(innerURL);

  return innerURL;

  

  URL linkUri = null;

  try

  linkUri = new URL(strWeb innerURL);

   catch (MalformedURLException e)

  //TODO Auto generated catch block

  e printStackTrace();

  return null;

  

  String absURL = linkUri toString();

  absURL = JHtmlClear replace(absURL / );

  absURL = JHtmlClear replace(absURL / );

  System out println(absURL);

  return absURL;

  

  /**

  *<br>方法说明 创建mht文件

  *<br>输入参数 content 网页文本内容; urlScriptList 脚本链接集合; urlImageList 图片链接集合

  *<br>返回类型

  */

  private void createMhtArchive(String content ArrayList urlScriptList ArrayList urlImageList) throws Exception

  //Instantiate a Multipart object

  MimeMultipart mp = new MimeMultipart( related );

  Properties props = new Properties();

  props put( mail smtp host smtp);

  Session session = Session getDefaultInstance(props null);

  MimeMessage msg = new MimeMessage(session);

  // set mailer

  msg setHeader( X Mailer Code Manager SWT );

  // set from

  if (from != null)

  msg setFrom(new InternetAddress(from));

  

  // set subject

  if (subject != null)

  msg setSubject(subject);

  

  // to

  if (to != null)

  InternetAddress[] toAddresses = getInetAddresses(to);

  msg setRecipients(Message RecipientType TO toAddresses);

  

  // cc

  if (cc != null)

  InternetAddress[] ccAddresses = getInetAddresses(cc);

  msg setRecipients(Message RecipientType CC ccAddresses);

  

  // bcc

  if (bcc != null)

  InternetAddress[] bccAddresses = getInetAddresses(bcc);

  msg setRecipients(Message RecipientType BCC bccAddresses);

  

  //设置网页正文

  MimeBodyPart bp = new MimeBodyPart();

  bp setText(content strEncoding);

  bp addHeader( Content Type text/;charset= + strEncoding);

  bp addHeader( Content Location strWeb toString());

  mp addBodyPart(bp);

  int urlCount = urlScriptList size();

  for (int i = ; i < urlCount; i++)

  bp = new MimeBodyPart();

  ArrayList urlInfo = (ArrayList) urlScriptList get(i);

  // String url = urlInfo get( ) toString();

  String absoluteURL = urlInfo get( ) toString();

  bp

   addHeader( Content Location

  javax mail internet MimeUtility

   encodeWord( URLDecoder

   decode(absoluteURL strEncoding)));

  DataSource source = new AttachmentDataSource(absoluteURL text );

  bp setDataHandler(new DataHandler(source));

  mp addBodyPart(bp);

  

  urlCount = urlImageList size();

  for (int i = ; i < urlCount; i++)

  bp = new MimeBodyPart();

  ArrayList urlInfo = (ArrayList) urlImageList get(i);

  // String url = urlInfo get( ) toString();

  String absoluteURL = urlInfo get( ) toString();

  bp

   addHeader( Content Location

  javax mail internet MimeUtility

   encodeWord( URLDecoder

   decode(absoluteURL strEncoding)));

  DataSource source = new AttachmentDataSource(absoluteURL image );

  bp setDataHandler(new DataHandler(source));

  mp addBodyPart(bp);

  

  msg setContent(mp);

  // write the mime multi part message to a file

  msg writeTo(new FileOutputStream(strFileName));

  

  /**

  *<br>方法说明 mht转

  *<br>输入参数 strMht mht文件路径; strHtml 文件路径

  *<br>返回类型

  */

  public static void mht (String strMht String strHtml)

  try

  //TODO readEmlFile

  InputStream fis = new FileInputStream(strMht);

  Session mailSession = Session getDefaultInstance(System getProperties() null);

  MimeMessage msg = new MimeMessage(mailSession fis);

  Object content = msg getContent();

  if (content instanceof Multipart)

  MimeMultipart mp = (MimeMultipart)content;

  MimeBodyPart bp = (MimeBodyPart)mp getBodyPart( );

  String strEncodng = getEncoding(bp );

  String strText = getHtmlText(bp strEncodng);

  if (strText == null)

  return;

  File parent = null;

  if (mp getCount() > )

  parent = new File(new File(strHtml) getAbsolutePath() + files );

  parent mkdirs();

  if (!parent exists())

  return;

  

  for (int i = ; i < mp getCount(); ++i)

  MimeBodyPart bp = (MimeBodyPart)mp getBodyPart(i);

  String strUrl = getResourcesUrl(bp);

  if (strUrl == null)

  continue;

  DataHandler dataHandler = bp getDataHandler();

  MimePartDataSource source = (MimePartDataSource)dataHandler getDataSource();

  File resources = new File(parent getAbsolutePath() + File separator + getName(strUrl i));

  if (saveResourcesFile(resources bp getInputStream()))

  strText = JHtmlClear replace(strText strUrl resources getAbsolutePath());

  

  saveHtml(strText strHtml);

  

   catch (Exception e)

  // TODO Auto generated catch block

  e printStackTrace();

  

  

  /**

  *<br>方法说明 得到资源文件的name

  *<br>输入参数 strName 资源文件链接 ID 资源文件的序号

  *<br>返回类型 资源文件的本地临时文件名

  */

  public static String getName(String strName int ID)

  char separator = / ;

  System out println(strName);

  System out println(separator);

  if( strName lastIndexOf(separator) >= )

  return format(strName substring(strName lastIndexOf(separator) + ));

  return temp + ID;

  

  /**

  *<br>方法说明 得到网页编码

  *<br>输入参数 bp MimeBodyPart类型的网页内容

  *<br>返回类型 MimeBodyPart里的网页内容的编码

  */

  private static String getEncoding(MimeBodyPart bp)

  if (bp != null)

  try

  Enumeration list = bp getAllHeaders();

  while (list hasMoreElements())

  javax mail Header head = (javax mail Header)list nextElement();

  if (head getName(pareTo( Content Type ) == )

  String strType = head getValue();

  int pos = strType indexOf( charset= );

  if (pos != )

  String strEncoding = strType substring(pos + strType length());

  if (strEncoding toLowerCase(pareTo( gb ) == )

  strEncoding = gbk ;

  

  return strEncoding;

  

  

  

   catch (MessagingException e)

  // TODO Auto generated catch block

  e printStackTrace();

  

  

  return null;

  

  /**

  *<br>方法说明 得到资源文件url

  *<br>输入参数 bp MimeBodyPart类型的网页内容

  *<br>返回类型 资源文件url

  */

  private static String getResourcesUrl(MimeBodyPart bp)

  if (bp != null)

  try

  Enumeration list = bp getAllHeaders();

  while (list hasMoreElements())

  javax mail Header head = (javax mail Header)list nextElement();

  if (head getName(pareTo( Content Location ) == )

  return head getValue();

  

  

   catch (MessagingException e)

  // TODO Auto generated catch block

  e printStackTrace();

  

  

  return null;

  

  /**

  *<br>方法说明 格式化文件名

  *<br>输入参数 strName 文件名

  *<br>返回类型 经过处理的符合命名规则的文件名

  */

  private static String format(String strName)

  if (strName == null)

  return null;

  strName = strName replaceAll(      );

  String strText = \\\\/:*?\\ <>|^___FCKpd___ quot;;

  for (int i = ; i < strName length(); ++i)

  String ch = String valueOf(strName charAt(i));

  if (strText indexOf(ch) != )

  strName = strName replace(strName charAt(i) );

  

  

  return strName;

  

  /**

  *<br>方法说明 保存资源文件

  *<br>输入参数 resources 要创建的资源文件; inputStream 要输入文件中的流

  *<br>返回类型 boolean

  */

  private static boolean saveResourcesFile(File resources InputStream inputStream)

  if (resources == null || inputStream == null)

  return false;

  

  BufferedInputStream in = null;

  FileOutputStream fio = null;

  BufferedOutputStream osw = null;

  try

  in = new BufferedInputStream(inputStream);

  fio = new FileOutputStream(resources);

  osw = new BufferedOutputStream(new DataOutputStream(fio));

  int b;

  byte[] a = new byte[ ];

  boolean isEmpty = true;

  while ((b = in read(a)) != )

  isEmpty = false;

  osw write(a b);

  osw flush();

  

  osw close();

  fio close();

  in close();

  inputStream close();

  if (isEmpty)

  resources delete();

  return true;

   catch (Exception e)

  // TODO Auto generated catch block

  e printStackTrace();

  System out println( 解析mht失败 );

  return false;

   finally

  try

  if (osw != null)

  osw close();

  if (fio != null)

  fio close();

  if (in != null)

  in close();

  if (inputStream != null)

  inputStream close();

   catch (Exception e)

  e printStackTrace();

  System out println( 解析mht失败 );

  return false;

  

  

  

  /**

  *<br>方法说明 得到mht文件的标题

  *<br>输入参数 mhtFilename mht文件名

  *<br>返回类型 mht文件的标题

  */

  public static String getTitle(String mhtFilename)

  try

  //TODO readEmlFile

  InputStream fis = new FileInputStream(mhtFilename);

  Session mailSession = Session getDefaultInstance(System getProperties() null);

  MimeMessage msg = new MimeMessage(mailSession fis);

  Object content = msg getContent();

  if (content instanceof Multipart)

  MimeMultipart mp = (MimeMultipart)content;

  MimeBodyPart bp = (MimeBodyPart)mp getBodyPart( );

  String strEncodng = getEncoding(bp );

  String strText = getHtmlText(bp strEncodng);

  if (strText == null)

  return null;

  strText = strText toLowerCase();

  int pos = strText indexOf( <title> );

  int pos = strText indexOf( </title> );

  if (pos != && pos != && pos > pos )

  return strText substring(pos + pos ) trim();

  

  

  return null;

   catch (Exception e)

  // TODO Auto generated catch block

  e printStackTrace();

  return null;

  

  

  /**

  *<br>方法说明 得到文本

  *<br>输入参数 bp MimeBodyPart类型的网页内容; strEncoding 内容编码

  *<br>返回类型 文本

  */

  private static String getHtmlText(MimeBodyPart bp String strEncoding)

  InputStream textStream = null;

  BufferedInputStream buff = null;

  BufferedReader br = null;

  Reader r = null;

  try

  textStream = bp getInputStream();

  buff = new BufferedInputStream(textStream);

  r = new InputStreamReader(buff strEncoding);

  br = new BufferedReader(r);

  StringBuffer strHtml = new StringBuffer( );

  String strLine = null;

  while ((strLine = br readLine()) != null)

  strHtml append(strLine + \\r\\n );

  

  br close();

  r close();

  textStream close();

  return strHtml toString();

   catch (Exception e)

  // TODO Auto generated catch block

  e printStackTrace();

   finally

  try

  if (br != null)

  br close();

  if (buff != null)

  buff close();

  if (textStream != null)

  textStream close();

  catch(Exception e)

  System out println( 解析mht失败 );

  

  

  return null;

  

  /**

  *<br>方法说明 保存文件

  *<br>输入参数 strText 内容; strHtml 文件名

  *<br>返回类型

  */

  private static void saveHtml(String strText String strHtml)

  try

  FileWriter fw = new FileWriter(strHtml);

  fw write(strText);

  fw close();

   catch (IOException e)

  // TODO Auto generated catch block

  e printStackTrace();

  System out println( 解析mht失败 );

  

  

  private InternetAddress[] getInetAddresses(String emails) throws Exception

  ArrayList list = new ArrayList();

  StringTokenizer tok = new StringTokenizer(emails );

  while (tok hasMoreTokens())

  list add(tok nextToken());

  

  int count = list size();

  InternetAddress[] addresses = new InternetAddress[count];

  for (int i = ; i < count; i++)

  addresses[i] = new InternetAddress(list get(i) toString());

  

  return addresses;

  

  class AttachmentDataSource implements DataSource

  private MimetypesFileTypeMap map = new MimetypesFileTypeMap();

  private String strUrl;

  private String strType;

  private byte[] dataSize = null;

  /**

  * This is some content type maps

  */

  private Map normalMap = new HashMap();

  

  // Initiate normal mime type map

  // Images

  normalMap put( image image/jpeg );

  normalMap put( text text/plain );

  

  public AttachmentDataSource(String strUrl String strType)

  this strType = strType;

  this strUrl = strUrl;

  strUrl = strUrl trim();

  strUrl = strUrl replaceAll( % );

  dataSize = JQuery downBinaryFile(strUrl null);

  

  /**

  * Returns the content type

  */

  public String getContentType()

  return getMimeType(getName());

  

  public String getName()

  char separator = File separatorChar;

  if( strUrl lastIndexOf(separator) >= )

  return strUrl substring(strUrl lastIndexOf(separator) + );

  return strUrl;

  

  private String getMimeType(String fileName)

  String type = (String)normalMap get(strType);

  if (type == null)

  try

  type = map getContentType(fileName);

   catch (Exception e)

  // TODO: handle exception

  

  System out println(type);

  // Fix the null exception

  if (type == null)

  type = application/octet stream ;

  

  

  return type;

  

  public InputStream getInputStream() throws IOException

  // TODO Auto generated method stub

  if (dataSize == null)

  dataSize = new byte[ ];

  return new ByteArrayInputStream(dataSize);

  

  public OutputStream getOutputStream() throws IOException

  // TODO Auto generated method stub

  return new java io ByteArrayOutputStream();

  

  

  

cha138/Article/program/Java/hx/201311/26795

相关参考

知识大全 如何将C#时间格式转换为中文格式

如何将C#时间格式转换为中文格式  以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!  首先使用Java

知识大全 js将long日期格式转换为标准日期格式实现思路

js将long日期格式转换为标准日期格式  复制代码代码如下:cha138/Article/program/Java/JSP/201405/30753

知识大全 如何定义404错误页面

方法如下:创建一个HTML文件作为您的404页面。它可以是任何您想要的内容:可以只是一个空白页,也可以采用您网站的布局并包含相应的文字,这一切取决于您的意愿。将该文件保存为您想要的任何文件格式(HTML、PHP等),并把它上传到您的网页

知识大全 用JS快速保存网页中所有图片的方法

用JS快速保存网页中所有图片的方法  以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!打开一个欲保存所有

知识大全 3dmax中的动画怎么保存成AVI格式

3dmax中的动画怎么保存成AVI格式!  以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!3dmax中

下列数字图像的文件格式中,能够在网页上发布并可以具有动画效果的是

下列数字图像的文件格式中,能够在网页上发布并可以具有动画效果的是_____。A、BMP B、GIF C、JPEG D、TIFF 答案:B 解析:GIF文件格式的数字图像具有动画效果。

知识大全 Java实现网页自动登录

Java实现网页自动登录  以下文字资料是由(全榜网网www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!  我不得不使用好几个系统都

知识大全 Java格式化日期

Java格式化日期  以下文字资料是由(全榜网 www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!  Java代码

知识大全 Java如何实现网页程序自动登录

Java如何实现网页程序自动登录  以下文字资料是由(全榜网 www.cha138.com)小编为大家搜集整理后发布的内容,让我们赶快一起来看一下吧!

知识大全 在word中制作的简历可以以什么格式保存

在word中制作的简历可以以什么格式保存?这要分两种情况:一般对于在Word中已经完成(确定无需再编辑或修改)的简历文件,建议用PDF格式进行保存。因为它具有跨平台、能保留文件原有格式(Layout)