怎样使用.NET/C# 获取百度搜索结果2015

我们经常需要用程序读取百度的搜索结果，以便做进一步处理。现在很多程序使用.NET来做定时任务，那么怎样使用.NET/C# 获取百度搜索结果呢？

我们首先应该分析百度的搜索结果，发现百度的搜索结果的格式为：

从图中标记的部分可以看出，百度的搜索结果都位于id="content_left"的容器中，每条搜索结果是一个class="result c-container"的元素，每条结果的标题又包含在h3标签中，如下图所示：

因此我们有了思路：

根据关键字获取到百度搜索结果的整个HTML文本

正则匹配到搜索结果容器的HTML

正则匹配到搜索结果每一项的HTML

取出每项结果中的题目和链接地址

直接上干货，看下面的代码：

using System;
 using System.Collections.Generic;
 using System.Text;
 using System.Text.RegularExpressions;
 using System.Web;
 using System.Net;
using System.IO;
namespace BaiduSearchTest
{
    /// <summary>
    /// One search result: its title, its summary text and its target link.
    /// All three fields default to null until assigned.
    /// </summary>
    struct BaiduEntry
    {
        /// <summary>Title of the result.</summary>
        public string title;

        /// <summary>Short summary (abstract) of the result.</summary>
        public string brief;

        /// <summary>Link address of the result.</summary>
        public string link;
    }
    class Program
    {
        /// <summary>
        /// Downloads the Baidu result page for <paramref name="keyword"/> and returns its HTML.
        /// </summary>
        /// <param name="keyword">Search keyword typed by the user.</param>
        /// <returns>
        /// The page text decoded as UTF-8. If the request or the download fails, a message is
        /// printed and whatever was read so far (possibly an empty string) is returned.
        /// </returns>
        static string GetHtml(string keyword)
        {
            string url = @"http://www.baidu.com/";
            // Per the original author's note: Baidu expects the query string encoded with
            // code page 936 (GBK), whereas the returned page itself is read as UTF-8 below.
            string encodedKeyword = HttpUtility.UrlEncode(keyword, Encoding.GetEncoding(936));
            string query = "s?wd=" + encodedKeyword;

            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url + query);
            StringBuilder sb = new StringBuilder();
            try
            {
                // The response, its stream and the reader are all disposed when the block exits,
                // even if reading throws half-way through.
                using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
                {
                    Console.WriteLine("正在读取网页{0}的内容……", url + query);

                    // StreamReader keeps decoder state between reads, so a multi-byte UTF-8
                    // character split across two network chunks is still decoded correctly.
                    char[] buf = new char[8192];
                    int count;
                    while ((count = reader.Read(buf, 0, buf.Length)) > 0)
                    {
                        sb.Append(buf, 0, count);
                    }
                }
            }
            catch (WebException)
            {
                Console.WriteLine("网络连接失败，请检查网络设置。");
            }
            catch (IOException)
            {
                Console.WriteLine("网络连接失败，请检查网络设置。");
            }
            return sb.ToString();
        }
        /// <summary>
        /// Prints every entry to the console: a running number, then the link, title and
        /// summary (each only when it is not null), followed by a separator line.
        /// </summary>
        /// <param name="entries">Entries to print, in display order.</param>
        static void PrintResult(List<BaiduEntry> entries)
        {
            int count = 0;
            foreach (BaiduEntry entry in entries)
            {
                count++;
                Console.WriteLine("找到了百度的第{0}条搜索结果：", count);
                if (entry.link != null)
                {
                    Console.WriteLine("找到了一条链接：");
                    Console.WriteLine(entry.link);
                }
                if (entry.title != null)
                {
                    Console.WriteLine("标题为：");
                    Console.WriteLine(entry.title);
                }
                if (entry.brief != null)
                {
                    Console.WriteLine("下面是摘要：");
                    Console.WriteLine(entry.brief);
                }
                Program.Cut();
            }
        }
        /// <summary>
        /// Manual smoke test: passes a small HTML sample through RemoveSomeTags and prints
        /// the result.
        /// </summary>
        static void simpleOutput()
        {
            // NOTE(review): the markup of the original sample was lost (only the text
            // "testhello" and a line break survived); this sample is a reconstruction - confirm.
            string html = "<em>test</em>hello<br>";
            Console.WriteLine(RemoveSomeTags(html));
        }
        /// <summary>
        /// Removes line-break tags from <paramref name="html"/>.
        /// </summary>
        /// <param name="html">HTML fragment; may be null or empty.</param>
        /// <returns>The fragment without the listed tags; null or empty input is returned unchanged.</returns>
        static string RemoveVoidTag(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            // NOTE(review): the original literal was lost (a raw line break was left inside the
            // quotes); "<br>" and its self-closing spellings are a reconstruction - confirm.
            string[] filter = { "<br>", "<br/>", "<br />" };
            foreach (string tag in filter)
            {
                html = html.Replace(tag, "");
            }
            return html;
        }
        /// <summary>
        /// Strips inline formatting tags from <paramref name="html"/> while keeping the text
        /// between them.
        /// </summary>
        /// <param name="html">HTML fragment; may be null or empty.</param>
        /// <returns>The fragment with the opening and closing tags removed; null or empty input is returned unchanged.</returns>
        static string ReleaseXmlTags(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            // NOTE(review): the eight original patterns were lost (all empty, which made this
            // method a no-op). Four open/close pairs - a, em, b, font - are assumed; confirm.
            // \b keeps "<b" from also matching tags such as <br> or <body>.
            return Regex.Replace(html, @"</?(?:a|em|b|font)\b[^>]*>", "", RegexOptions.IgnoreCase);
        }

        /// <summary>
        /// Cleans an HTML fragment by applying RemoveVoidTag first and ReleaseXmlTags second.
        /// </summary>
        /// <param name="html">HTML fragment to clean.</param>
        /// <returns>The result of the second pass.</returns>
        static string RemoveSomeTags(string html)
        {
            string withoutVoidTags = RemoveVoidTag(html);
            return ReleaseXmlTags(withoutVoidTags);
        }
        /// <summary>
        /// Writes a line of tildes to the console as a visual separator between results.
        /// </summary>
        static void Cut()
        {
            const string separator = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~";
            Console.Out.WriteLine(separator);
        }
        /// <summary>
        /// Convenience overload: processes <paramref name="input"/> with tagsForBrief set to false.
        /// </summary>
        /// <param name="input">HTML of the search-result container.</param>
        static void MainProc(string input)
        {
            MainProc(input, tagsForBrief: false);
        }

        /// <summary>
        /// Splits the result-container HTML into individual result items and, for each item,
        /// prints the inner HTML of every h3 element followed by every URL found in the item,
        /// one per line.
        /// </summary>
        /// <param name="input">HTML of the search-result container; null or empty prints nothing.</param>
        /// <param name="tagsForBrief">Currently unused; kept so existing callers still compile.</param>
        static void MainProc(string input, bool tagsForBrief)
        {
            if (string.IsNullOrEmpty(input))
            {
                return;
            }

            // NOTE(review): the original item pattern was lost. This reconstruction follows the
            // article's description (each item is an element with class="result c-container")
            // and should be checked against the live page markup.
            const string itemOpen = @"<div[^>]*\bclass=[""']result c-container";

            // An item runs from its opening tag up to the next item's opening tag (or the end
            // of the input); this sidesteps matching nested closing div tags with a regex.
            Regex itemRegex = new Regex(itemOpen + @"[\s\S]*?(?=" + itemOpen + @"|$)", RegexOptions.IgnoreCase);

            // Title: whatever sits between <h3 ...> and </h3> (inner markup is printed as-is).
            const string titlePattern = @"<h3[^>]*>([\s\S]+?)</h3>";

            // Link: any absolute http/https URL appearing inside the item.
            const string linkPattern = @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";

            foreach (Match item in itemRegex.Matches(input))
            {
                foreach (Match title in Regex.Matches(item.Value, titlePattern, RegexOptions.IgnoreCase))
                {
                    Console.WriteLine(title.Groups[1].Value);
                }

                foreach (Match link in Regex.Matches(item.Value, linkPattern, RegexOptions.IgnoreCase))
                {
                    Console.WriteLine(link.Value);
                }
            }
        }
        /// <summary>
        /// Entry point: reads a keyword from the console, downloads the Baidu result page,
        /// cuts it down to the result container and prints the titles and links found in it.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static void Main(string[] args)
        {
            Console.WriteLine("请输入一个关键字。");
            string keyword = Console.ReadLine();
            Console.WriteLine("正在从百度上获取结果，请稍等……");
            string input = GetHtml(keyword);

            // NOTE(review): the original container pattern was lost. Per the article, results
            // live inside the element with id="content_left"; everything from that opening tag
            // onwards is kept, because a regex cannot reliably find the matching closing tag.
            Regex r = new Regex(@"<div[^>]*\bid=[""']?content_left\b[\s\S]*", RegexOptions.IgnoreCase);
            Match container = r.Match(input);
            if (container.Success)
            {
                MainProc(container.Value);
            }
            else
            {
                // Covers both a failed download (empty page) and a changed page layout.
                Console.WriteLine("未找到搜索结果。");
            }
            Console.ReadKey(true);
        }
    }
}