JAVA抓取网页的图片,JAVA利用正则表达式抓取网站图片实例源码



JAVA抓取网页的图片,JAVA利用正则表达式抓取网站图片实例源码。利用java抓取网页上的所有图片:

用两个正则表达式:

1、匹配html中img标签的正则:<img.*src=(.*?)[^>]*?>

2、匹配img标签中的src中http路径的正则:http:\"?(.*?)(\"|>|\\s+)

实现:

[java] view plaincopyprint?

    package org.swinglife.main; 
     
    import java.io.ByteArrayOutputStream;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.OutputStream;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
     
    /***
     * java抓取网络图片
     * @author swinglife
     *
     */ 
    public class CatchImage { 
     
        // 地址 
        private static final String URL = “http://www.csdn.net”; 
        // 编码 
        private static final String ECODING = “UTF-8″; 
        // 获取img标签正则 
        private static final String IMGURL_REG = “<img.*src=(.*?)[^>]*?>”; 
        // 获取src路径的正则 
        private static final String IMGSRC_REG = “http:\”?(.*?)(\”|>|\\s+)”; 
     
         
        public static void main(String[] args) throws Exception { 
            CatchImage cm = new CatchImage(); 
            //获得html文本内容 
            String HTML = cm.getHTML(URL); 
            //获取图片标签 
            List<String> imgUrl = cm.getImageUrl(HTML); 
            //获取图片src地址 
            List<String> imgSrc = cm.getImageSrc(imgUrl); 
            //下载图片 
            cm.Download(imgSrc); 
        } 
         
         
        /***
         * 获取HTML内容
         * 
         * @param url
         * @return
         * @throws Exception
         */ 
        private String getHTML(String url) throws Exception { 
            URL uri = new URL(url); 
            URLConnection connection = uri.openConnection(); 
            InputStream in = connection.getInputStream(); 
            byte[] buf = new byte[1024]; 
            int length = 0; 
            StringBuffer sb = new StringBuffer(); 
            while ((length = in.read(buf, 0, buf.length)) > 0) { 
                sb.append(new String(buf, ECODING)); 
            } 
            in.close(); 
            return sb.toString(); 
        } 
     
        /***
         * 获取ImageUrl地址
         * 
         * @param HTML
         * @return
         */ 
        private List<String> getImageUrl(String HTML) { 
            Matcher matcher = Pattern.compile(IMGURL_REG).matcher(HTML); 
            List<String> listImgUrl = new ArrayList<String>(); 
            while (matcher.find()) { 
                listImgUrl.add(matcher.group()); 
            } 
            return listImgUrl; 
        } 
     
        /***
         * 获取ImageSrc地址
         * 
         * @param listImageUrl
         * @return
         */ 
        private List<String> getImageSrc(List<String> listImageUrl) { 
            List<String> listImgSrc = new ArrayList<String>(); 
            for (String image : listImageUrl) { 
                Matcher matcher = Pattern.compile(IMGSRC_REG).matcher(image); 
                while (matcher.find()) { 
                    listImgSrc.add(matcher.group().substring(0, matcher.group().length() – 1)); 
                } 
            } 
            return listImgSrc; 
        } 
     
        /***
         * 下载图片
         * 
         * @param listImgSrc
         */ 
        private void Download(List<String> listImgSrc) { 
            try { 
                for (String url : listImgSrc) { 
                    String imageName = url.substring(url.lastIndexOf(“/”) + 1, url.length()); 
                    URL uri = new URL(url); 
                    InputStream in = uri.openStream(); 
                    FileOutputStream fo = new FileOutputStream(new File(imageName)); 
                    byte[] buf = new byte[1024]; 
                    int length = 0; 
                    System.out.println(“开始下载:” + url); 
                    while ((length = in.read(buf, 0, buf.length)) != -1) { 
                        fo.write(buf, 0, length); 
                    } 
                    in.close(); 
                    fo.close(); 
                    System.out.println(imageName + “下载完成”); 
                } 
            } catch (Exception e) { 
                System.out.println(“下载失败”); 
            } 
        } 
     
         
    }