请问 C# 如何获取网页里所有链接的完整 URL 地址？
网友回复
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace HelloWorldApplication
{
    /// <summary>
    /// Console sample that downloads a web page and prints the absolute URL
    /// of every http/https link found in its href attributes.
    /// </summary>
    class HelloWorld
    {
        // Captures the value of a double- or single-quoted href attribute into the
        // "url" group. The look-behind keeps attributes such as data-href from matching.
        private static readonly Regex HrefPattern = new Regex(
            "(?<![\\w-])href\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)')",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Downloads the page, extracts every href value, resolves it against the
        /// page address and writes one absolute URL per line to standard output.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Uri pageUri = new Uri("http://www.bfw.wiki");

            string content;
            // WebClient is IDisposable; release it as soon as the download is done.
            using (WebClient client = new WebClient())
            {
                byte[] page = client.DownloadData(pageUri);
                content = Encoding.UTF8.GetString(page);
            }

            foreach (Match match in HrefPattern.Matches(content))
            {
                // Attribute values are HTML-encoded (e.g. &amp; in query strings).
                string href = WebUtility.HtmlDecode(match.Groups["url"].Value).Trim();
                if (string.IsNullOrEmpty(href))
                {
                    continue;
                }

                // Resolve relative links ("/a", "./a", "a/b", "//host/a") against the page URL.
                Uri absolute;
                if (!Uri.TryCreate(pageUri, href, out absolute))
                {
                    continue;
                }

                // Skip javascript:, mailto:, tel: and other non-web schemes.
                bool isWebLink =
                    string.Equals(absolute.Scheme, Uri.UriSchemeHttp, StringComparison.Ordinal) ||
                    string.Equals(absolute.Scheme, Uri.UriSchemeHttps, StringComparison.Ordinal);
                if (!isWebLink)
                {
                    continue;
                }

                Console.Write(absolute.AbsoluteUri + "\r\n");
            }
        }
    }
}
也可以这样写，把下载网页的代码封装成 GetHtml 方法：
using System;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace HelloWorldApplication
{
    /// <summary>
    /// Console sample that fetches a web page through <see cref="GetHtml"/> and prints
    /// the absolute URL of every http/https link found in its href attributes.
    /// </summary>
    class HelloWorld
    {
        // Captures the value of a double- or single-quoted href attribute into the
        // "url" group. The look-behind keeps attributes such as data-href from matching.
        private static readonly Regex HrefPattern = new Regex(
            "(?<![\\w-])href\\s*=\\s*(?:\"(?<url>[^\"]*)\"|'(?<url>[^']*)')",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Downloads the document at <paramref name="url"/> and returns its text.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The response body decoded as UTF-8 (a byte-order mark, if present, overrides this).</returns>
        /// <exception cref="WebException">The download failed.</exception>
        public static string GetHtml(string url)
        {
            // The using statements release the client, stream and reader even when the
            // download or the read throws.
            using (WebClient client = new WebClient())
            using (Stream stream = client.OpenRead(url))
            // Encoding.Default varies by platform (ANSI code page on .NET Framework),
            // so decode explicitly. NOTE(review): assumes the page is served as UTF-8.
            using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Fetches the page, extracts every href value, resolves it against the page
        /// address and writes one absolute URL per line to standard output.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Fetch the page's HTML source.
            Uri pageUri = new Uri("http://www.bfw.wiki");
            string data = GetHtml("http://www.bfw.wiki");

            // Match link addresses with the regular expression.
            foreach (Match match in HrefPattern.Matches(data))
            {
                // Attribute values are HTML-encoded (e.g. &amp; in query strings).
                string href = WebUtility.HtmlDecode(match.Groups["url"].Value).Trim();
                if (string.IsNullOrEmpty(href))
                {
                    continue;
                }

                // Resolve relative links ("/a", "./a", "a/b", "//host/a") against the page URL.
                Uri absolute;
                if (!Uri.TryCreate(pageUri, href, out absolute))
                {
                    continue;
                }

                // Skip javascript:, mailto:, tel: and other non-web schemes.
                bool isWebLink =
                    string.Equals(absolute.Scheme, Uri.UriSchemeHttp, StringComparison.Ordinal) ||
                    string.Equals(absolute.Scheme, Uri.UriSchemeHttps, StringComparison.Ordinal);
                if (!isWebLink)
                {
                    continue;
                }

                // Print the address.
                Console.Write(absolute.AbsoluteUri + "\r\n");
            }
        }
    }
}