我们经常需要用程序读取百度的搜索结果,以便做进一步的处理。现在很多程序使用 .NET 来做定时任务,那么怎样使用 .NET/C# 获取百度搜索结果呢?
首先分析百度搜索结果页面的 HTML,可以发现搜索结果的格式为:
从图中标记的部分可以知道,百度的搜索结果都位于 id="content_left" 的元素中,每一条搜索结果是一个 class="result c-container" 的元素,而每条结果的标题又包含在 h3 标签中,如下图所示:
因此我们有了思路:
直接上干货,看下面的代码:
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Net;
using System.IO;

namespace BaiduSearchTest
{
    /// <summary>
    /// One parsed search hit. Any field may be null when the corresponding
    /// piece could not be extracted from the result markup.
    /// </summary>
    struct BaiduEntry
    {
        public string title, brief, link;
    }

    /// <summary>
    /// Console demo: reads a keyword, downloads the Baidu result page for it and
    /// prints the title and first link of every result item found in the HTML.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the HTML-matching patterns in this class were lost when the
    /// listing was extracted (every angle-bracket token was stripped). They have
    /// been reconstructed from the page structure the accompanying article
    /// describes (results inside id="content_left", one item per
    /// class="result c-container", title inside an h3 element). Verify them
    /// against the live markup before relying on them.
    /// </remarks>
    class Program
    {
        // Everything from the opening tag of the id="content_left" container to
        // the end of the document. Nested divs make it impractical to find the
        // matching close tag with a simple pattern, and trailing markup is
        // harmless because items are matched individually afterwards.
        static readonly Regex ResultContainerRegex = new Regex(
            @"<div[^>]*id=""content_left""[\s\S]*",
            RegexOptions.IgnoreCase);

        // One result item: from the opening tag of a "result c-container" div
        // up to (not including) the next such div, or the end of the input.
        static readonly Regex ResultItemRegex = new Regex(
            @"<div[^>]*class=""result c-container[\s\S]*?(?=<div[^>]*class=""result c-container|$)",
            RegexOptions.IgnoreCase);

        // Inner markup of the h3 element that holds the item's title.
        static readonly Regex TitleRegex = new Regex(
            @"<h3[^>]*>([\s\S]+?)</h3>",
            RegexOptions.IgnoreCase);

        // Absolute URL; the original pattern accepted only "http://".
        static readonly Regex LinkRegex = new Regex(
            @"https?://([\w-]+\.)+[\w-]+(/[\w\- ./?%&=]*)?",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Downloads the Baidu result page for <paramref name="keyword"/>.
        /// </summary>
        /// <param name="keyword">Search keyword; null is treated as empty.</param>
        /// <returns>
        /// The page HTML decoded as UTF-8, or an empty string when the request
        /// or the read fails (a message is written to the console in that case).
        /// </returns>
        static string GetHtml(string keyword)
        {
            string url = @"http://www.baidu.com/";

            // The original encoded the keyword with code page 936 (GBK), which is
            // not available by default on newer .NET runtimes. Declaring the
            // input encoding with "ie=utf-8" lets us send plain UTF-8 instead.
            string query = "s?ie=utf-8&wd=" + Uri.EscapeDataString(keyword ?? string.Empty);

            Console.WriteLine("正在读取网页{0}的内容……", url + query);
            try
            {
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url + query);
                using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
                {
                    // StreamReader carries partial multi-byte sequences across
                    // buffer boundaries; decoding each raw chunk separately (as
                    // the original did) corrupts characters that straddle them.
                    return reader.ReadToEnd();
                }
            }
            catch (WebException)
            {
                Console.WriteLine("网络连接失败,请检查网络设置。");
            }
            catch (IOException)
            {
                Console.WriteLine("网络连接失败,请检查网络设置。");
            }
            return string.Empty;
        }

        /// <summary>
        /// Writes every entry to the console, numbered from 1, followed by a
        /// separator line. Null fields are skipped.
        /// </summary>
        /// <param name="entries">Entries to print, in display order.</param>
        static void PrintResult(List<BaiduEntry> entries)
        {
            int count = 0;
            foreach (BaiduEntry entry in entries)
            {
                count += 1;
                Console.WriteLine("找到了百度的第{0}条搜索结果:", count);
                if (entry.link != null)
                {
                    Console.WriteLine("找到了一条链接:");
                    Console.WriteLine(entry.link);
                }
                if (entry.title != null)
                {
                    Console.WriteLine("标题为:");
                    Console.WriteLine(entry.title);
                }
                if (entry.brief != null)
                {
                    Console.WriteLine("下面是摘要:");
                    Console.WriteLine(entry.brief);
                }
                Program.Cut();
            }
        }

        /// <summary>
        /// Small manual check of <see cref="RemoveSomeTags"/>; not called by Main.
        /// </summary>
        static void simpleOutput()
        {
            // NOTE(review): the original sample markup was lost in extraction;
            // this is a stand-in that exercises both tag filters.
            string html = "<em>test</em><br><b>hello</b>";
            Console.WriteLine(RemoveSomeTags(html));
        }

        /// <summary>Removes line-break tags from <paramref name="html"/>.</summary>
        static string RemoveVoidTag(string html)
        {
            // NOTE(review): reconstructed filter list (originals were stripped).
            string[] filter = { "<br>", "<br/>", "<br />" };
            foreach (string tag in filter)
            {
                html = html.Replace(tag, "");
            }
            return html;
        }

        /// <summary>
        /// Removes inline formatting tags (em, b, font, a) but keeps their text.
        /// </summary>
        static string ReleaseXmlTags(string html)
        {
            // NOTE(review): reconstructed; the original listed eight separate
            // open/close tag patterns whose text was stripped. \b keeps "b" from
            // also matching tags such as "br" or "body".
            return Regex.Replace(html, @"</?(em|b|font|a)\b[^>]*>", "", RegexOptions.IgnoreCase);
        }

        /// <summary>Applies both tag filters to <paramref name="html"/>.</summary>
        static string RemoveSomeTags(string html)
        {
            html = RemoveVoidTag(html);
            html = ReleaseXmlTags(html);
            return html;
        }

        /// <summary>Writes a visual separator line.</summary>
        static void Cut()
        {
            Console.WriteLine("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
        }

        /// <summary>
        /// Parses and prints the result items found in <paramref name="input"/>.
        /// </summary>
        static void MainProc(string input)
        {
            MainProc(input, false);
        }

        /// <summary>
        /// Extracts the title and first link of every result item in
        /// <paramref name="input"/> and prints them with <see cref="PrintResult"/>.
        /// </summary>
        /// <param name="input">HTML of the result container.</param>
        /// <param name="tagsForBrief">
        /// Currently unused (as in the original); kept so existing calls compile.
        /// </param>
        static void MainProc(string input, bool tagsForBrief)
        {
            List<BaiduEntry> collection = new List<BaiduEntry>();

            foreach (Match item in ResultItemRegex.Matches(input ?? string.Empty))
            {
                BaiduEntry entry = new BaiduEntry();

                Match title = TitleRegex.Match(item.Value);
                if (title.Success)
                {
                    // The h3 wraps the title in an anchor with highlight tags.
                    entry.title = RemoveSomeTags(title.Groups[1].Value).Trim();
                }

                Match link = LinkRegex.Match(item.Value);
                if (link.Success)
                {
                    entry.link = link.Value;
                }

                collection.Add(entry);
            }

            if (collection.Count == 0)
            {
                Console.WriteLine("没有解析到任何搜索结果,可能是百度的页面结构变了。");
                return;
            }

            PrintResult(collection);
        }

        public static void Main(string[] args)
        {
            Console.WriteLine("请输入一个关键字。");
            // ReadLine returns null when standard input is closed.
            string keyword = Console.ReadLine() ?? string.Empty;

            Console.WriteLine("正在从百度上获取结果,请稍等……");
            string input = GetHtml(keyword);

            // Narrow the page down to the result container before item parsing.
            input = ResultContainerRegex.Match(input).Value;
            MainProc(input);

            Console.ReadKey(true);
        }
    }
}
程序结果如下图所示:
通过上面的例子,你应该明白怎样使用 .NET/C# 获取百度搜索结果项了吧。程序可以直接使用;如果没有得到结果,说明百度搜索结果页面的结构变了,请按照程序的思路修改相应的正则表达式。