C#采集网站页面的所有图片与标题内容

 新闻资讯     |      2018-05-31 21:53

原理很简单:抓取网页源码=>正则匹配图片地址=>使用C#提供的方法下载图片

如果页面上显示的只是缩略图,则再获取包裹该图片的 A 标签的 HREF 地址,从中过滤出原图地址即可。

 代码如下 static string GetImgUrl(string html) 
        { 
            //string regex = @"<img.*?src=[""'](.+?)[""'].*?>";//这个不可以匹配没有单引号和双引号的情况 
            string regex = @"<img.*?src=['""]?([^""'s]+)"; 
            //string regex=@"(?is)<img[sS]*?src=['""]?([^'""s]+)"; 
            StringBuilder builder = new StringBuilder(); 
            MatchCollection mc = Regex.Matches(html, regex, RegexOptions.IgnoreCase | RegexOptions.Multiline); 
            for (int i = 0; i < mc.Count; i++) 
            { 
                builder.Append(mc[i].Groups[1].Value); 
                builder.Append("@"); 
            } 
            return builder.ToString(); 
        } 
  
        static void Main(string[] args) 
        { 
            string html = GetHtmlInfo("www.baidu.com", 15000, Encoding.GetEncoding("GBK")); 
            Console.WriteLine(html); 
  
            //html = "<dsd<img alt='www ' src=0.jpg />ssd<img alt='dff ' src='1.jpg' title='AA'><img src='http://filesimg.111cn.net/2013/11/07/20131107021918325.jpg'>dsds<img src="3.png" />  fd<img    src='4.jpg'><img title='dsds' src='5.jpg'>"; 
            //Console.WriteLine(GetImgUrl(html)); 
  
  
            string[] imgarr = GetImgUrl(html).TrimEnd('@').Split('@');//去掉最后一个@符号,再分割成数组 
            foreach (string str in imgarr) 
            { 
                SaveImg(str); 
                Console.WriteLine(str); 
            } 
            Console.Read(); 
  
        } 
        /// <summary> 
        /// 下载指定页面所有图片 
        /// </summary> 
        /// <param name="imgurl"></param> 
        static void SaveImg(string imgurl) 
        { 
            string imgName = imgurl.Substring(imgurl.LastIndexOf('/'));//获取原来图片名称 
            WebRequest request = WebRequest.Create(imgurl); 
            WebResponse response = request.GetResponse(); 
            Stream reader = response.GetResponseStream(); 
            if (!Directory.Exists(@"D:tony")) { Directory.CreateDirectory(@"D:Tony"); } 
            FileStream writer = new FileStream(@"D:Tony" + imgName, FileMode.OpenOrCreate, FileAccess.Write); 
            byte[] buff = new byte[512]; 
            int c = 0; 
            while ((c = reader.Read(buff, 0, buff.Length)) > 0) 
            { 
                writer.Write(buff, 0, c); 
            } 
//此处可以过滤图片尺寸 
           using (Image img = Image.FromFile(@"D:Tony" + imgName + ".jpg")) 
            { 
                if (img.Size.Width > 100) 
                { 
                    Console.WriteLine(img.Size); 
                } 
            } 
            writer.Close(); 
            reader.Close(); 
            response.Close(); 
        } 
  
  
/// <summary>
        /// Downloads a page and returns its complete HTML source.
        /// </summary>
        /// <param name="url">Page address; "http://" is prepended when no http(s) scheme is present.</param>
        /// <param name="timeout">Request timeout in milliseconds.</param>
        /// <param name="EnCodeType">Encoding used to decode the response body.</param>
        /// <returns>
        /// The page source with every line terminated by "\r\n"; an empty string when the
        /// status code is not 200 OK; the exception message when the request fails.
        /// </returns>
        static string GetHtmlInfo(string url, int timeout, Encoding EnCodeType)
        {
            if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                && !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                url = "http://" + url;
            }

            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = timeout;
                // The property takes the header value only; the header name is added by the framework.
                request.UserAgent = "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729)";
                request.Accept = "*/*";
                request.KeepAlive = true;
                request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        return string.Empty;
                    }

                    using (Stream stream = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(stream, EnCodeType))
                    {
                        StringBuilder builder = new StringBuilder();
                        string line;
                        while ((line = reader.ReadLine()) != null)
                        {
                            builder.Append(line);
                            builder.Append("\r\n");
                        }
                        return builder.ToString();
                    }
                }
            }
            catch (Exception ex)
            {
                // Kept for compatibility with existing callers: a failure is reported through
                // the return value, so the result may be an error message rather than HTML.
                return ex.Message;
            }
        }
 


C#快速获取网页页面的标题

 代码如下

using System.Text.RegularExpressions; 
/// <summary>
        /// Returns the text of the first <c>&lt;title&gt;</c> element in the given HTML.
        /// </summary>
        /// <param name="html">HTML source to search; null or empty yields an empty result.</param>
        /// <returns>The trimmed title text, or an empty string when there is no title element.</returns>
        static string GetTitle(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            // [\s\S]*? matches any character including line breaks, and is lazy so the match
            // stops at the first closing tag instead of the last one in the document.
            string regex = @"<title\b[^>]*>([\s\S]*?)</title>";
            Regex reg = new Regex(regex, RegexOptions.IgnoreCase);
            return reg.Match(html).Groups[1].Value.Trim();
        }
 
        static void Main(string[] args)
        {
            string html = GetHtmlInfo("www.rczjp.cn", 5000, Encoding.UTF8);
            Console.WriteLine(html);
            Console.WriteLine(GetTitle(html));
 
            Console.Read();
 
        }
 
        /// <summary>
        /// 获取页面的HTML信息,到标题(</title>)位置结束
        /// </summary>
        /// <param name="url">页面地址</param>
        /// <param name="timeout">超时时间,单位:ms</param>
        /// <param name="EnCodeType">编码</param>
        /// <returns></returns>
        static string GetHtmlInfo(string url, int timeout, Encoding EnCodeType)
        {
            if (!url.StartsWith("http://") && !url.StartsWith("https://")) { url = "http://" + url; }
            string result = "";
            System.IO.StreamReader reader = null;
            string temp = "";
            try
            {
                HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);//初始化WebRequest
                request.Timeout = timeout;
                request.UserAgent = "User-Agent:Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729)";
                request.Accept = "*/*";
                request.KeepAlive = true;
                request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
                HttpWebResponse response = (HttpWebResponse)request.GetResponse();//返回来自Internet的响应
                if (response.StatusCode == System.Net.HttpStatusCode.OK)
                {
                    StringBuilder builder = new StringBuilder();
                    Stream stream = response.GetResponseStream();
                    reader = new StreamReader(stream, EnCodeType);
                    string tmp = "";
                    while ((temp = reader.ReadLine()) != null)
                    {
                        builder.Append(temp);
                        tmp = builder.ToString();
                        if (tmp.IndexOf("</title>") > 0) { break; }//ReadLine是读取整行,所以有时在它后面的很多字符串也会读取
                        Console.WriteLine(tmp.IndexOf("</title>"));
                        builder.Append("rn");
                    }
                    result = builder.ToString();
                    return result;
                }
                return string.Empty;
            }
            catch (Exception ex)
            {
                return ex.Message;
            }
            finally { if (reader != null) { reader.Close(); } }
        }