htmlunit+fastjson抓取酷狗音乐 qq音乐链接及下载 / 憋错料

上次学了 jsoup 之后，发现一些由 JavaScript 动态生成的网页内容是无法抓取的，于是又学习了 HtmlUnit。下面是用 HtmlUnit 请求接口、用 fastjson 解析返回的 JSON，抓取酷狗音乐与 QQ 音乐歌曲链接并下载的例子：

酷狗音乐：

import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLEncoder;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.nodes.Element;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;

/**
 * Searches Kugou Music for a fixed keyword and saves every playable hit to disk.
 *
 * <p>Flow: call the JSONP song-search endpoint, then for each hit call the
 * {@code play/getdata} endpoint to obtain its title and play URL, and stream that
 * URL into a file under {@link #OUTPUT_DIR}.
 */
public class worm7 {
    /** Search keyword sent to the song-search endpoint. */
    private static final String name = "离骚";

    /** Prefix prepended to every output file name. */
    private static final String OUTPUT_DIR = "f://";

    /** Connect/read timeout for the download connection, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 60000;

    /** Size of the copy buffer used while downloading. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Creates a {@link WebClient} configured for scraping: insecure SSL accepted, CSS and
     * applets disabled, no exceptions on failing status codes or script errors.
     *
     * @param flag whether JavaScript execution is enabled
     * @return the configured client; the caller is responsible for closing it
     */
    public static WebClient getWebClient(boolean flag) {
        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_45);
        webClient.getOptions().setUseInsecureSSL(true);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setRedirectEnabled(true);
        webClient.getOptions().setAppletEnabled(false);
        webClient.getOptions().setJavaScriptEnabled(flag);
        webClient.getOptions().setTimeout(60000);
        webClient.getOptions().setPrintContentOnFailingStatusCode(false);
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        return webClient;
    }

    /**
     * Runs the search and downloads every hit that exposes a play URL.
     *
     * <p>A failure on one song is reported and the loop moves on to the next song; a failure
     * of the search request itself is reported and ends the run.
     *
     * @param webClient client used for all API requests
     * @return the search keyword
     */
    public static String getMp3Url(WebClient webClient) {
        try {
            Page page = webClient.getPage("http://songsearch.kugou.com/song_search_v2?"
                    + "callback=jQuery112408395432201569397_1532930925600"
                    + "&keyword=" + URLEncoder.encode(name, "utf-8")
                    + "&page=1"
                    + "&pagesize=30"
                    + "&userid=-1"
                    + "&clientver="
                    + "&platform=WebFilter"
                    + "&tag=em"
                    + "&filter=2"
                    + "&iscorrection=1"
                    + "&privilege_filter=0"
                    + "&_=" + System.currentTimeMillis());
            String payload = extractJsonpPayload(page.getWebResponse().getContentAsString());
            JSONObject job = JSONObject.parseObject(payload).getJSONObject("data");
            System.out.println("job:" + job);
            if (job == null) {
                System.err.println("Search response has no \"data\" object");
                return name;
            }
            JSONArray list = job.getJSONArray("lists");
            System.out.println("list" + list);
            if (list == null) {
                System.err.println("Search response has no \"lists\" array");
                return name;
            }
            for (int i = 0; i < list.size(); i++) {
                try {
                    downloadSong(webClient, list.getJSONObject(i));
                } catch (Exception e) {
                    // Boundary catch: one broken or unavailable song must not stop the others.
                    System.err.println("Skipping search result #" + i + ": " + e);
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return name;
    }

    /** Looks up the title and play URL of one search hit and downloads it when a URL is available. */
    private static void downloadSong(WebClient webClient, JSONObject song) throws java.io.IOException {
        String detailUrl = "http://www.kugou.com/yy/index.php?r=play/getdata"
                + "&hash=" + song.getString("FileHash")
                + "&album_id=" + song.getString("AlbumID")
                + "&_=" + System.currentTimeMillis();
        Page detailPage = webClient.getPage(detailUrl);
        JSONObject detail = JSONObject.parseObject(detailPage.getWebResponse().getContentAsString())
                .getJSONObject("data");
        if (detail == null) {
            System.err.println("No \"data\" object in detail response for " + detailUrl);
            return;
        }
        String title = detail.getString("audio_name");
        String playUrl = detail.getString("play_url");
        System.out.println("标题：" + title);
        System.out.println("mp3：" + playUrl);
        if (playUrl == null || playUrl.isEmpty()) {
            System.err.println("No play URL for \"" + title + "\", skipping");
            return;
        }
        download(playUrl, OUTPUT_DIR + safeFileName(title) + ".mp3");
    }

    /** Streams {@code sourceUrl} into the file {@code targetPath}, closing both streams afterwards. */
    private static void download(String sourceUrl, String targetPath) throws java.io.IOException {
        java.net.URLConnection connection = new URL(sourceUrl).openConnection();
        // Without timeouts a stalled server would block the whole run indefinitely.
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        try (InputStream in = new BufferedInputStream(connection.getInputStream());
                FileOutputStream out = new FileOutputStream(targetPath)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
    }

    /**
     * Returns the JSON text of a JSONP response, i.e. everything between the first
     * {@code '('} and the last {@code ')'}; a response that already starts with
     * <code>{</code> is returned as is.
     *
     * @throws IllegalStateException if the response is null or has no such parentheses
     */
    private static String extractJsonpPayload(String response) {
        if (response == null) {
            throw new IllegalStateException("Response body is null");
        }
        String text = response.trim();
        if (text.startsWith("{")) {
            return text;
        }
        int open = text.indexOf('(');
        int close = text.lastIndexOf(')');
        if (open < 0 || close <= open) {
            throw new IllegalStateException("Response is not JSONP: "
                    + text.substring(0, Math.min(text.length(), 200)));
        }
        return text.substring(open + 1, close);
    }

    /** Replaces characters that are illegal in file names, including path separators, with {@code '_'}. */
    private static String safeFileName(String raw) {
        if (raw == null) {
            return "unknown";
        }
        String cleaned = raw.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
        return cleaned.isEmpty() ? "unknown" : cleaned;
    }

    public static void main(String[] args) {
        // WebClient is AutoCloseable; closing it releases its windows and background threads.
        try (WebClient webClient = getWebClient(false)) {
            getMp3Url(webClient);
        }
    }
}

　　运行结果：

qq音乐抓取实例：

import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.nodes.Element;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;

/**
 * Searches QQ Music for a fixed keyword and saves every hit for which a stream key can be
 * obtained.
 *
 * <p>Flow: call the JSONP search endpoint, then for each hit request a {@code vkey} from the
 * {@code fcg_music_express_mobile3} endpoint, build the stream URL from it, and stream that
 * URL into a file under {@link #OUTPUT_DIR}.
 */
public class worm6 {
    /** Search keyword sent to the search endpoint. */
    private static final String name = "离骚";

    /** Prefix prepended to every output file name. */
    private static final String OUTPUT_DIR = "f://";

    /** Client identifier sent both when requesting the vkey and when streaming. */
    private static final String GUID = "2241489759";

    /** Connect/read timeout for the download connection, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 60000;

    /** Size of the copy buffer used while downloading. */
    private static final int BUFFER_SIZE = 8192;

    // The package-visible fields below are kept for backward compatibility only. id1, id2,
    // name1, name2 and url are overwritten with the values of the song currently being
    // processed; the downloading code itself no longer reads them.
    static String id1 = null;
    static String id2 = null;
    static String id3 = null;
    static String id4 = null;
    static String name1 = null;
    static String name2 = null;
    static String url = null;
    static JSONObject job2 = null;

    /**
     * Creates a {@link WebClient} configured for scraping: insecure SSL accepted, CSS and
     * applets disabled, no exceptions on failing status codes or script errors.
     *
     * @param flag whether JavaScript execution is enabled
     * @return the configured client; the caller is responsible for closing it
     */
    public static WebClient getWebClient(boolean flag) {
        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_45);
        webClient.getOptions().setUseInsecureSSL(true);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setRedirectEnabled(true);
        webClient.getOptions().setAppletEnabled(false);
        webClient.getOptions().setJavaScriptEnabled(flag);
        webClient.getOptions().setTimeout(60000);
        webClient.getOptions().setPrintContentOnFailingStatusCode(false);
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        return webClient;
    }

    /**
     * Runs the search and downloads every hit for which a stream URL can be built.
     *
     * <p>A failure on one song is reported and the loop moves on to the next song; a failure
     * of the search request itself is reported and ends the run.
     *
     * @param webClient client used for all API requests
     * @return the search keyword
     */
    public static String getMp3Url(WebClient webClient) {
        try {
            Page page = webClient.getPage("https://c.y.qq.com/soso/fcgi-bin/client_search_cp?"
                    + "ct=24"
                    + "&qqmusic_ver=1298"
                    + "&new_json=1"
                    + "&remoteplace=txt.yqq.center"
                    + "&searchid=36047978388657978"
                    + "&t=0"
                    + "&aggr=1"
                    + "&cr=1"
                    + "&catZhida=1"
                    + "&lossless=0"
                    + "&p=1"
                    + "&n=20"
                    + "&w=" + URLEncoder.encode(name, "utf-8")
                    + "&g_tk=5381"
                    + "&jsonpCallback=MusicJsonCallback6176591962889693"
                    + "&loginUin=0"
                    + "&hostUin=0"
                    + "&format=jsonp"
                    + "&inCharset=utf8"
                    + "&outCharset=utf-8"
                    + "&notice=0"
                    + "&platform=yqq"
                    + "&needNewCode=0");
            String payload = extractJsonpPayload(page.getWebResponse().getContentAsString());
            JSONObject data = JSONObject.parseObject(payload).getJSONObject("data");
            JSONObject songSection = data == null ? null : data.getJSONObject("song");
            JSONArray list = songSection == null ? null : songSection.getJSONArray("list");
            if (list == null) {
                System.err.println("Search response has no data.song.list array");
                return name;
            }
            for (int i = 0; i < list.size(); i++) {
                try {
                    downloadSong(webClient, list.getJSONObject(i));
                } catch (Exception e) {
                    // Boundary catch: one broken or unavailable song must not stop the others.
                    System.err.println("Skipping search result #" + i + ": " + e);
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return name;
    }

    /** Resolves the stream URL of one search hit and downloads it when a vkey is available. */
    private static void downloadSong(WebClient webClient, JSONObject song) throws IOException {
        String songMid = song.getString("mid");
        JSONObject file = song.getJSONObject("file");
        if (file == null) {
            System.err.println("Song " + songMid + " has no \"file\" object, skipping");
            return;
        }
        String streamFile = "C400" + file.getString("media_mid") + ".m4a";
        String title = song.getString("title");
        JSONArray singers = song.getJSONArray("singer");
        String singer = (singers == null || singers.isEmpty())
                ? "unknown"
                : singers.getJSONObject(0).getString("name");

        String vkeyUrl = "https://c.y.qq.com/base/fcgi-bin/fcg_music_express_mobile3.fcg?g_tk=5381&jsonpCallback=MusicJsonCallback32651599216689386&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0&cid=205361747&callback=MusicJsonCallback32651599216689386&uin=0"
                + "&songmid=" + songMid
                + "&filename=" + streamFile
                + "&guid=" + GUID;
        Page vkeyPage = webClient.getPage(vkeyUrl);
        String payload = extractJsonpPayload(vkeyPage.getWebResponse().getContentAsString());
        JSONObject vkeyData = JSONObject.parseObject(payload).getJSONObject("data");
        JSONArray items = vkeyData == null ? null : vkeyData.getJSONArray("items");
        String vkey = (items == null || items.isEmpty())
                ? null
                : items.getJSONObject(0).getString("vkey");

        // Mirror the current song into the legacy static fields.
        id1 = songMid;
        id2 = streamFile;
        name1 = title;
        name2 = singer;

        if (vkey == null || vkey.isEmpty()) {
            System.err.println("No vkey returned for \"" + title + "\", skipping");
            return;
        }
        String streamUrl = "http://dl.stream.qqmusic.qq.com/" + streamFile
                + "?vkey=" + vkey
                + "&guid=" + GUID
                + "&uin=0&fromtag=66";
        url = streamUrl;
        System.out.println("name:" + title + "--" + singer);
        System.out.println("url:" + streamUrl);

        // NOTE(review): the stream is requested as a .m4a file but is stored with a .mp3
        // extension, as before; confirm which extension is actually wanted.
        download(streamUrl, OUTPUT_DIR + safeFileName(title + "--" + singer) + ".mp3");
    }

    /** Streams {@code sourceUrl} into the file {@code targetPath}, closing both streams afterwards. */
    private static void download(String sourceUrl, String targetPath) throws IOException {
        java.net.URLConnection connection = new URL(sourceUrl).openConnection();
        // Without timeouts a stalled server would block the whole run indefinitely.
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        try (InputStream in = new BufferedInputStream(connection.getInputStream());
                FileOutputStream out = new FileOutputStream(targetPath)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
    }

    /**
     * Returns the JSON text of a JSONP response, i.e. everything between the first
     * {@code '('} and the last {@code ')'}; a response that already starts with
     * <code>{</code> is returned as is.
     *
     * @throws IllegalStateException if the response is null or has no such parentheses
     */
    private static String extractJsonpPayload(String response) {
        if (response == null) {
            throw new IllegalStateException("Response body is null");
        }
        String text = response.trim();
        if (text.startsWith("{")) {
            return text;
        }
        int open = text.indexOf('(');
        int close = text.lastIndexOf(')');
        if (open < 0 || close <= open) {
            throw new IllegalStateException("Response is not JSONP: "
                    + text.substring(0, Math.min(text.length(), 200)));
        }
        return text.substring(open + 1, close);
    }

    /** Replaces characters that are illegal in file names, including path separators, with {@code '_'}. */
    private static String safeFileName(String raw) {
        if (raw == null) {
            return "unknown";
        }
        String cleaned = raw.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
        return cleaned.isEmpty() ? "unknown" : cleaned;
    }

    public static void main(String[] args) {
        // WebClient is AutoCloseable; closing it releases its windows and background threads.
        try (WebClient webClient = getWebClient(false)) {
            getMp3Url(webClient);
        }
    }
}

运行结果：

相比之下，酷狗音乐相对好爬一些：搜索之后再请求一次详情接口就能拿到播放地址；QQ 音乐则繁琐一些，需要先额外请求一次 vkey，再拼出最终的下载链接。

原文地址：https://www.cnblogs.com/xr210/p/9404325.html

时间： 2024-10-10 06:57:53

htmlunit+fastjson抓取酷狗音乐 qq音乐链接及下载

htmlunit+fastjson抓取酷狗音乐 qq音乐链接及下载的相关文章

Java爬虫系列之实战：爬取酷狗音乐网 TOP500 的歌曲

python使用beautifulsoup4爬取酷狗音乐

爬虫程序2-爬取酷狗top500

java使用htmlunit工具抓取js中加载的数据

Java实现简易爬虫--抓取酷安网用户头像

抓取一个网站的所有网址链接

python爬虫：使用urllib.request和BeautifulSoup抓取新浪新闻标题、链接和主要内容

Java爬虫实战（一）：抓取一个网站上的全部链接

使用Jsoup抓取网站上的图片、链接