[jsoup] - 简易爬虫

    xiaoxiao2025-01-12  10

    从网站中爬取文字和图片

    public class TravelSpider { public static void main(String[] args) throws Exception { String url = "http://www.jinmalvyou.com/search/index/view_type/1/keyword/国内"; fetchTravelData(url); } private static void fetchTravelData(String url) throws Exception { //1.读取url,得到Document对象 Document document = Jsoup.connect(url).get(); //2.获取到这一页所有的旅游路线信息 Elements elements = document.select(".rl-b-li"); //3.循环处理每个路线信息 for (Element element : elements) { //3.1 获取路线名称 Elements rnameElement = element.select(".pro-title>a"); String rname = rnameElement.text(); System.out.println("路线名称:" + rname); //3.2 获取路线介绍 Elements introduceElements = element.select(".pro-colomn"); Element introduceElement = introduceElements.get(0); String routeIntroduce = introduceElement.text(); System.out.println("路线介绍:" + routeIntroduce); //3.3 获取路线价格 Elements priceElements = element.select(".price>strong"); String price = priceElements.text(); System.out.println("路线价格:" + price); //3.4 获取路线图片 Elements rimageElements = element.select(".pro-img img"); String rimageSrc = "http:" + rimageElements.attr("src"); String localPath = saveImage(rimageSrc); System.out.println("路线图片:" +localPath); System.out.println("-----------------------------------"); } //4.爬取下一页的内容 Elements nextElements = document.select("a.next"); if (nextElements != null && !nextElements.isEmpty()) { String nextUrl = "http://www.jinmalvyou.com" + nextElements.attr("href"); fetchTravelData(nextUrl); } } /** * Java程序发HTTP请求,得到图片,保存到本地。返回图片在本地的保存路径 * @param rimageSrc * @return */ private static String saveImage(String rimageSrc) throws IOException { //从图片路径里,得到图片名称:http://img.jinmalvyou.com/20190516/goods_thumb_22624_330_195.jpeg int index = rimageSrc.lastIndexOf("/"); String rimageName = rimageSrc.substring(index + 1); //goods_thumb_22624_330_195.jpeg String localPath = "E:\\63\\travelImages\\" + rimageName; //1.创建一个客户端对象,相当于我们的浏览器 CloseableHttpClient client = HttpClients.createDefault(); //2.创建HTTP请求对象 HttpGet get = new HttpGet(rimageSrc); //3.使用客户端发送HTTP请求 CloseableHttpResponse response = client.execute(get); //4.判断响应状态 if (response.getStatusLine().getStatusCode() == 200) { //5.得到响应结果 HttpEntity entity = response.getEntity(); //6.得到响应的内容 响应体 InputStream inputStream = entity.getContent(); //7.创建一个输出流 FileOutputStream outputStream = new FileOutputStream(localPath); //8.把数据写到输出流里 /*int len = -1; byte[] buffer = new byte[1024 * 8]; while ((len = inputStream.read(buffer)) != -1) { outputStream.write(buffer, 0, len); }*/ IOUtils.copy(inputStream, outputStream); inputStream.close(); outputStream.close(); } return localPath; } }
    最新回复(0)