使用c#实现爬虫技术

    xiaoxiao2022-07-12  177

    这是我的第一个爬虫项目,也是我第一次接触 C# 窗体程序。我的需求:页面中有音频文件,但是它是单个下载的,用户需要一个一个地去点击下载按钮进行下载。我的目的:根据用户的需求筛选出相关的数据,拿到页面上用户筛选出的数据后实现批量下载,将文件下载并存放到用户本地文件夹中,然后对下载下来的这些文件进行播放。主要用到的插件有:CefSharp、HtmlAgilityPack,其中 CefSharp 用于将浏览器页面嵌入到 WinForm 中。

    将web页面嵌入到winForm的界面中

    /// <summary>
    /// Runs when the form loads: initializes CEF, creates the embedded Chromium browser,
    /// docks it inside the top panel and hooks up the frame-load handlers.
    /// </summary>
    /// <param name="sender">Event source.</param>
    /// <param name="e">Event data (unused).</param>
    private void Form1_Load(object sender, EventArgs e)
    {
        CefSettings settings = new CefSettings();
        Cef.Initialize(settings);

        // Placeholder literal: replace it with the address of the web page to embed.
        webbrowser = new ChromiumWebBrowser("要嵌入的web地址");
        webbrowser.Dock = DockStyle.Fill;
        this.pnlTop.Controls.Add(webbrowser);

        // Both handlers are attached to the same event, so each runs on every FrameLoadEnd.
        webbrowser.FrameLoadEnd += Webbrowser_FrameLoadEnd;
        webbrowser.FrameLoadEnd += SetCookie;
    }

    下面是获取web页面的url地址做相应的操作

    /// <summary>
    /// Handles the browser's FrameLoadEnd event and dispatches on the URL of the main frame:
    /// one URL triggers a redirect, the others fetch the page source for processing.
    /// </summary>
    /// <param name="sender">Event source.</param>
    /// <param name="e">Event data carrying the frame that finished loading.</param>
    private void Webbrowser_FrameLoadEnd(object sender, FrameLoadEndEventArgs e)
    {
        // Only the main frame is handled; events from other frames are ignored.
        if (!e.Frame.IsMain)
        {
            return;
        }

        // Placeholder URL: different addresses are handled differently.
        if (e.Frame.Url == "页面的url地址(不同的地址处理不同的事情)")
        {
            string listPage = "想要跳转的页面地址";
            string js = "window.location.href='" + listPage + "';";

            // Running this script inside the page makes the browser navigate to listPage.
            this.webbrowser.ExecuteScriptAsync(js);
            return;
        }

        if (e.Frame.Url == "url1")
        {
            // The continuation runs asynchronously once the page source is available.
            e.Frame.GetSourceAsync().ContinueWith(task =>
            {
                // The fetched page source; it is handed to SavaProcess for processing.
                string html = task.Result;

                // NOTE(review): the returned path is not used further in this excerpt.
                String filePath = SavaProcess(html);
            });
            return;
        }

        if (e.Frame.Url == "url2")
        {
            e.Frame.GetSourceAsync().ContinueWith(task =>
            {
                string htmlDom = task.Result;

                // Load the source into an HtmlDocument so it can be queried with XPath.
                var doc = new HtmlDocument();
                doc.LoadHtml(htmlDom);

                // Original note here: "get the total number of pages".
                // NOTE(review): requoption is configured but not used further in this excerpt.
                request requoption = new request();
                requoption.Method = "POST";

                // The XPath below is tied to the structure of the scraped page.
                var pageTr = doc.DocumentNode.SelectNodes(@"/html[1]/body[1]/div[3]/table[1]/tbody[1]/tr[@class='forPage']/td[1]/div[1]/div[1]");

                // SelectNodes yields null (not an empty collection) when nothing matches.
                if (pageTr != null && pageTr.Count > 0)
                {
                    // All span nodes underneath the first matched node.
                    var spanNodes = pageTr[0].SelectNodes(@".//span");
                }
            });
            return;
        }
    }

    设置cookie方法

    /// <summary>
    /// FrameLoadEnd handler that walks every cookie in the global CEF cookie manager;
    /// each visited cookie is delivered to <see cref="Visitor_SendCookie"/>.
    /// </summary>
    /// <param name="sender">Event source.</param>
    /// <param name="e">Event data (unused).</param>
    private void SetCookie(object sender, CefSharp.FrameLoadEndEventArgs e)
    {
        var cookieManager = CefSharp.Cef.GetGlobalCookieManager();
        CookieVisitor visitor = new CookieVisitor();
        visitor.SendCookie += Visitor_SendCookie;
        cookieManager.VisitAllCookies(visitor);
    }

    /// <summary>
    /// Stores a visited cookie in the <c>cookies</c> dictionary under the key
    /// "domain^name" (leading dots trimmed from the domain), overwriting any previous value.
    /// </summary>
    /// <param name="obj">The cookie reported by the visitor.</param>
    private void Visitor_SendCookie(CefSharp.Cookie obj)
    {
        lock (lockObject)
        {
            string key = obj.Domain.TrimStart('.') + "^" + obj.Name;

            // The indexer adds the entry when the key is new and replaces it otherwise.
            cookies[key] = obj.Value;
        }
    }

    /// <summary>
    /// Converts the entries of the <c>cookies</c> dictionary into System.Net cookies.
    /// </summary>
    /// <returns>A new collection with one cookie per dictionary entry.</returns>
    private CookieCollection GetCookieCollection()
    {
        lock (lockObject)
        {
            CookieCollection cookieCollection = new CookieCollection();
            foreach (var keyValue in cookies)
            {
                // Keys are built as domain + '^' + name, so split on the first '^' only:
                // a cookie name that itself contains '^' must not be truncated.
                string[] parts = keyValue.Key.Split(new[] { '^' }, 2);

                System.Net.Cookie cookie = new System.Net.Cookie();
                cookie.Domain = parts[0];
                cookie.Name = parts[1];
                cookie.Value = keyValue.Value;
                cookieCollection.Add(cookie);
            }
            return cookieCollection;
        }
    }

    下面是已经拿到音频文件的地址了,然后请求下载地址下载文件

    /// <summary>
    /// Downloads the resource at <paramref name="url"/> into the AudioList\AMR folder under the
    /// application base directory and records the local file path on <paramref name="data"/>.
    /// </summary>
    /// <param name="url">Address of the file to download.</param>
    /// <param name="fileName">Name of the local file; an existing file is overwritten.</param>
    /// <param name="data">Record whose LocalPath is set to the local file path.</param>
    /// <remarks>
    /// Failures are not rethrown; they are reported through <see cref="System.Diagnostics.Trace"/>.
    /// </remarks>
    public void HttpWebRequestGet(Uri url, string fileName, DataModel data)
    {
        try
        {
            HttpWebRequest audioRequest = (HttpWebRequest)HttpWebRequest.Create(url);
            audioRequest.Method = "GET";
            audioRequest.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
            audioRequest.KeepAlive = true;
            audioRequest.Referer = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
            audioRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36";
            audioRequest.ContentType = "application/x-www-form-urlencoded";

            // Advertising gzip/deflate by hand would leave a compressed body undecoded on disk.
            // AutomaticDecompression sends the Accept-Encoding header and decodes the response.
            audioRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
            audioRequest.Headers.Set("Accept-Language", "zh-CN,zh;q=0.9");
            audioRequest.Headers.Set("Upgrade-Insecure-Requests", "1");
            audioRequest.Headers.Set("Cookie", "JSESSIONID=" + JSESSIONID + ";rememberPass=1;userAccount=" + uid + ";#pwd=" + pwd + ";loginByTwoCode=0");

            // CreateDirectory does nothing when the directory already exists.
            string path = System.AppDomain.CurrentDomain.BaseDirectory + @"AudioList\AMR";
            System.IO.Directory.CreateDirectory(path);

            var localAmrnb = path + "\\" + fileName;

            // The using blocks release the response and both streams even if the copy fails.
            using (HttpWebResponse rsp = (HttpWebResponse)audioRequest.GetResponse())
            using (var responseStream = rsp.GetResponseStream())
            using (FileStream fs = new FileStream(localAmrnb, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
            {
                data.LocalPath = localAmrnb;
                responseStream.CopyTo(fs);
            }
        }
        catch (Exception ex)
        {
            // Best effort: the caller is not interrupted, but the failure is no longer silent.
            System.Diagnostics.Trace.TraceError("Failed to download {0} to {1}: {2}", url, fileName, ex);
        }
    }

    // Serialize the data and write it to the file at DownloadDataPath (overwriting it).
    // NOTE(review): the list's type argument is missing from the original text ("List" with no
    // element type); DataModel is assumed because the entries expose LocalPath — confirm.
    List<DataModel> dataList = new List<DataModel>();
    using (System.IO.StreamWriter file1 = new System.IO.StreamWriter(DownloadDataPath, false))
    {
        file1.Write(new JavaScriptSerializer().Serialize(dataList));
    }

    // Read the data back from the file and deserialize it.
    using (System.IO.StreamReader sr = new System.IO.StreamReader(DownloadDataPath, Encoding.UTF8))
    {
        // Only the first line is read; the serialized list is written without line breaks.
        string line = sr.ReadLine();
        if (line != null)
        {
            oldData = line;
        }
    }

    // Deserialize BEFORE reopening the file for writing: opening the writer truncates the file,
    // so a failure here must not destroy the previously saved data.
    List<DataModel> oldDataList = string.IsNullOrEmpty(oldData)
        ? new List<DataModel>()
        : new JavaScriptSerializer().Deserialize<List<DataModel>>(oldData) ?? new List<DataModel>();

    // Append the previously saved entries after the ones already in dataList.
    dataList.AddRange(oldDataList);

    using (System.IO.StreamWriter file2 = new System.IO.StreamWriter(DownloadDataPath, false))
    {
        file2.Write(new JavaScriptSerializer().Serialize(dataList));
    }

    下面向窗体中添加 MediaPlayer 播放器。首先添加引用,如下图所示;其次将 MediaPlayer 组件添加到工具箱中:在菜单栏选择“工具”→“选择工具箱选项”,添加如下组件。添加完之后就可以在工具箱中将组件直接拖到界面上了。

    具体实现播放的代码如下所示

    /// <summary>
    /// Clears the player's current playlist and refills it with one media item per entry
    /// of oldDataList, using each entry's LocalPath.
    /// </summary>
    /// <returns>Always <c>true</c>.</returns>
    public Boolean getMediaPlayData()
    {
        this.playMedia.currentPlaylist.clear();

        // Queue every file that should be played.
        foreach (var entry in oldDataList)
        {
            var media = playMedia.newMedia(entry.LocalPath);
            this.playMedia.currentPlaylist.appendItem(media);
        }

        return true;
    }

    /// <summary>
    /// Click handler of the "query and play" button: rebuilds the playlist and starts playback
    /// with auto-start enabled and shuffle turned off.
    /// </summary>
    /// <param name="sender">Event source.</param>
    /// <param name="e">Event data (unused).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        if (!getMediaPlayData())
        {
            return;
        }

        var player = this.playMedia;
        player.settings.autoStart = true;
        player.settings.setMode("shuffle", false);
        player.Ctlcontrols.play();
    }

    /// <summary>
    /// Play-state handler: when the player reports the Ready state, playback is started again.
    /// </summary>
    /// <param name="sender">Event source.</param>
    /// <param name="e">Event data carrying the new play state.</param>
    private void wmp_PlayStateChange(object sender, AxWMPLib._WMPOCXEvents_PlayStateChangeEvent e)
    {
        var state = (WMPLib.WMPPlayState)e.newState;
        if (state != WMPLib.WMPPlayState.wmppsReady)
        {
            return;
        }

        // Original author's intent: once a file has finished, move on to the next one.
        playMedia.Ctlcontrols.play();
    }

    以上不是完整的代码。 总体来说把大致的过程和用到的一些技术记录下来,加深记忆。

    最新回复(0)