首发于 菜菜编程

java爬虫爬取小说

首先要介绍这个小说网站

输入一个小说名

返回小说列表

然后根据所选择的对应小说,得到章节目录列表。

然后根据隐藏在html中的href,爬取具体小说内容。


介绍一下开发环境

Maven项目

依赖

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlcleaner/htmlcleaner -->
<dependency>
    <groupId>net.sourceforge.htmlcleaner</groupId>
    <artifactId>htmlcleaner</artifactId>
    <version>2.21</version>
</dependency>

几个简单的bean

package com.ym.reptile.bean;

/**
 * Simple bean describing one chapter of a novel: an identifier, a display
 * name and the URL the chapter text can be fetched from.
 */
public class Chapter {
    private String id;
    private String name;
    private String url;

    /** Creates an empty chapter; all fields start out as {@code null}. */
    public Chapter() {
    }

    /**
     * Creates a fully populated chapter.
     *
     * @param id   chapter identifier
     * @param name chapter name
     * @param url  link to the chapter content
     */
    public Chapter(String id, String name, String url) {
        this.id = id;
        this.name = name;
        this.url = url;
    }

    public String getId() {
        return this.id;
    }

    public String getName() {
        return this.name;
    }

    public String getUrl() {
        return this.url;
    }

    public void setId(String id) {
        this.id = id;
    }

    public void setName(String name) {
        this.name = name;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /** Renders all three fields in a {@code Chapter{...}} debug format. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Chapter{");
        text.append("id='").append(id).append('\'');
        text.append(", name='").append(name).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append('}');
        return text.toString();
    }

    /** Writes the chapter as one tab-separated line to standard output. */
    public void doSPrint(){
        StringBuilder line = new StringBuilder();
        line.append("id=").append(id);
        line.append("\tname=").append(name);
        line.append("\thref=").append(url);
        System.out.println(line.toString());
    }
}

Fiction类

package com.ym.reptile.bean;

import java.util.List;

/**
 * Holds the text of one fetched chapter: a title ({@code name}) and the
 * list of text sections that follow it.
 */
public class Fiction {
    private String name;
    private List<String> sections;

    /**
     * @param name     title of the fetched text
     * @param sections the text sections, in reading order
     */
    public Fiction(String name, List<String> sections) {
        this.name = name;
        this.sections = sections;
    }

    /** Creates an empty instance; both fields start out as {@code null}. */
    public Fiction() {
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public List<String> getSections() {
        return sections;
    }

    public void setSections(List<String> sections) {
        this.sections = sections;
    }

    @Override
    public String toString() {
        return "Fiction{" +
                "name='" + name + '\'' +
                ", sections=" + sections +
                '}';
    }


    /**
     * Prints the name on one line, followed by every section on its own
     * tab-indented line. An instance without sections (for example one built
     * with the no-arg constructor) prints only the name instead of failing
     * with a NullPointerException.
     */
    public void print(){
        System.out.println(name);
        if (sections == null) {
            return;
        }
        for (String s : sections) {
            System.out.println("\t"+s);
        }
    }
}


Production类

package com.ym.reptile.bean;

/**
 * Bean for one search result: a numeric id, the work's name, its author and
 * the link to its chapter directory.
 */
public class Production {

    private int id;
    private String name;
    private String chapterDirectoryHref;
    private String author;

    /** Creates an empty instance (id 0, all strings {@code null}). */
    public Production() {
    }

    /**
     * @param name   name of the work
     * @param author author of the work
     */
    public Production(String name, String author) {
        this.name = name;
        this.author = author;
    }

    /**
     * @param id     numeric id of the entry
     * @param name   name of the work
     * @param author author of the work
     */
    public Production(int id, String name, String author) {
        this.id = id;
        this.name = name;
        this.author = author;
    }

    public int getId() {
        return this.id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getAuthor() {
        return this.author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getChapterDirectoryHref() {
        return this.chapterDirectoryHref;
    }

    public void setChapterDirectoryHref(String chapterDirectoryHref) {
        this.chapterDirectoryHref = chapterDirectoryHref;
    }

    /** Renders every field in a {@code Production{...}} debug format. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Production{");
        text.append("id=").append(id);
        text.append(", name='").append(name).append('\'');
        text.append(", ChapterDirectoryHref='").append(chapterDirectoryHref).append('\'');
        text.append(", author='").append(author).append('\'');
        text.append('}');
        return text.toString();
    }

    /** Writes id, name, author and directory link as one tab-separated line to standard output. */
    public void print(){
        StringBuilder line = new StringBuilder();
        line.append(id).append("\t");
        line.append(name).append("\t");
        line.append(author).append("\t");
        line.append(getChapterDirectoryHref());
        System.out.println(line.toString());
    }

}

可能会用到的几个工具类


package com.ym.reptile.util;

/**
 * Helper that renders small integers as Chinese numerals.
 * {@link #output()} prints the reading of every integer from 0 to 9999 to
 * standard output; the remaining methods are the building blocks it uses.
 */
public class Number {


    // Unit characters in descending magnitude: ten-thousand, thousand, hundred, ten.
    char[] danWei = { '万','千', '百', '十' };

    /**
     * Converts a single digit into its Chinese numeral character.
     *
     * @param i the digit to convert
     * @return the numeral for 1 through 9; the zero character for any other value
     */
    public char hanZi(int i) {
        switch (i) {
            case 1:
                return '一';
            case 2:
                return '二';
            case 3:
                return '三';
            case 4:
                return '四';
            case 5:
                return '五';
            case 6:
                return '六';
            case 7:
                return '七';
            case 8:
                return '八';
            case 9:
                return '九';
            default:
                return '零';
        }
    }
    /**
     * Converts a digit character into its int value.
     *
     * @param c the character to convert
     * @return 1 through 9 for the characters '1' through '9'; 0 for any other character
     */
    public int charToInt(char c) {
        switch (c) {
            case '1':
                return 1;
            case '2':
                return 2;
            case '3':
                return 3;
            case '4':
                return 4;
            case '5':
                return 5;
            case '6':
                return 6;
            case '7':
                return 7;
            case '8':
                return 8;
            case '9':
                return 9;
            default:
                return 0;
        }
    }
    /**
     * Returns the length of the number, i.e. how many digits it has.
     *
     * @param strNumber the number in its string form
     * @return the length of {@code strNumber}
     */
    public int numberLength(String strNumber) {
        return strNumber.length();
    }
    /**
     * Converts an int into its String representation.
     *
     * @param i the number to convert
     * @return {@code String.valueOf(i)}
     */
    public String numberToString(int i) {
        return String.valueOf(i);
    }

    /**
     * Prints the Chinese numeral reading of every integer from 0 to 9999 to
     * standard output, one number per line. Values below 10 are printed as a
     * single numeral; larger values are printed digit by digit, each non-zero
     * digit followed by its unit from {@code danWei} (no unit after the ones
     * digit). A run of zeros between non-zero digits is printed as one zero
     * character, and trailing zeros are not printed.
     */
    public void output() {
        for (int i = 0; i < 10000; i++) {
            if (i < 10) {
                System.out.println(hanZi(i));
            } else {
                // char array holding each digit of the number
                char[] numToChar = new char[100];
                String numberToString = numberToString(i);
                // int array holding each digit of the number
                int[] arrayNumber = new int[100];
                // numLength is the number of digits
                int numLength = numberLength(numberToString);
                // walk the digits left to right, storing them in both the char and the int array
                for (int j = 0; j < numLength; j++) {
                    numToChar[j] = numberToString.charAt(j);
                    arrayNumber[j] = charToInt(numToChar[j]);
                }
                // number --> digit array conversion is done at this point
                // starting position of the unit index; it is incremented before each use,
                // so the first unit printed is danWei[danWei.length - numLength + 1]
                int danWeiIndex = danWei.length - numLength;
                // walk the digits left to right
                for (int k = 0; k < numLength; k++) {
                    // the digit at position k
                    int everyStateNum = arrayNumber[k];
                    boolean flag = false;
                    if (everyStateNum == 0) {
                        // scanning left to right: from the first 0 onwards, print nothing if only
                        // zeros follow, otherwise print a single zero character
                        for (int q = k + 1; q < numLength; q++) {
                            // every skipped position still advances the unit index
                            danWeiIndex++;
                            if (arrayNumber[q] != 0) {
                                // resume the outer loop at q (the loop's k++ turns q - 1 into q)
                                k = q - 1;
                                System.out.print("零");
                                flag = true;
                                break;
                            }
                            if (danWeiIndex == danWei.length) {
                                break;
                            }
                        }
                        if (flag) {
                            continue;
                        } else {
                            // only zeros remained: the number is finished
                            break;
                        }
                    } else {
                        System.out.print(hanZi(everyStateNum));
                        danWeiIndex++;
                        // keep the unit index in bounds: at the ones digit print the digit but no unit
                        if (danWeiIndex == danWei.length) {
                            break;
                        }
                        System.out.print(danWei[danWeiIndex]);
                    }
                }
                System.out.println();
            }
        }
    }

}

爬虫工具类Reptile

package com.ym.reptile.util;


import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Map;

/*
* @author ym
* date 2019-08-07
* */
/**
 * Small HTTP helper used by the crawler: downloads pages either through
 * {@link URLConnection} or through Apache HttpClient, and builds a couple of
 * dictionary lookup URLs.
 */
public class Reptile {

    /**
     * Downloads the page at {@code url} and returns its lines concatenated
     * without line separators.
     *
     * <p>Every line of the response is read exactly once and the reader is
     * always closed. On a malformed URL or an I/O failure the stack trace is
     * printed and whatever was read so far (possibly nothing) is returned.
     *
     * @param url address of the page to download
     * @return the page text; never {@code null}
     */
    public StringBuilder fromURLGetHtml(String url){
        StringBuilder ret = new StringBuilder();
        try {
            URL realUrl = new URL(url);
            // open the connection to the url
            URLConnection connection = realUrl.openConnection();
            // NOTE(review): the response is decoded with the platform default charset;
            // confirm this matches the encoding the target site actually serves.
            try (BufferedReader bufferedReader =
                         new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
                String line;
                // read each line once; testing readLine() and then calling it again
                // would drop every other line and could append the text "null"
                while ((line = bufferedReader.readLine()) != null) {
                    ret.append(line);
                }
            }
        }catch(MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

        return ret;
    }


    /**
     * Placeholder for sending a POST request. Not implemented: the method
     * currently does nothing with its arguments.
     *
     * @param url    target address (unused)
     * @param params request parameters (unused)
     * @throws IOException declared for the future implementation; never thrown today
     */
    public void sendPost(String url, Map<String, Object> params) throws IOException {
        // TODO: not implemented yet
    }


    /**
     * Fetches {@code url} with Apache HttpClient, prints the response status
     * line to standard output and returns the response body. The client and
     * the response are closed before returning, also when an exception occurs.
     *
     * @param url address of the page to fetch
     * @return the response body
     * @throws IOException if the request or reading the body fails
     */
    public StringBuilder getPage(String url) throws IOException {
        StringBuilder ret = new StringBuilder();
        HttpGet request = new HttpGet(url);

        try (CloseableHttpClient client = HttpClients.custom().build();
             CloseableHttpResponse response = client.execute(request)) {
            HttpEntity entity = response.getEntity();
            System.out.println(response.getStatusLine());
            ret.append(EntityUtils.toString(entity));
        }
        return ret;
    }


    /**
     * Builds the youdao search URL for {@code query}; the query is
     * URL-encoded as UTF-8.
     *
     * @param query text to look up
     * @return the complete search URL
     * @throws Exception if the query cannot be encoded
     */
    public String ChineseToEnglishTranslationLink(String query) throws Exception {
        String url = "http://dict.youdao.com/search";
        String newQuery = new StringOperation().toURLCode(query);
        url = url+"?q="+newQuery+"&keyfrom=new-fanyi.smartResult";
        return url;
    }

    /**
     * Builds the youdao word-detail URL for {@code query}; the query is
     * URL-encoded as UTF-8.
     *
     * @param query word to look up
     * @return the complete detail URL, ending in a slash
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    public String fromEnglishGetDetail(String query) throws UnsupportedEncodingException {
        String url ="http://dict.youdao.com/w/eng/";
        String newQuery = new StringOperation().toURLCode(query);
        url+=newQuery+"/";
        return url;
    }

}

字符串编码处理类StringOperation

package com.ym.reptile.util;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;

/**
 * String helpers used by the crawler: case conversion, character class
 * checks, a backslash-u style escape encoding, URL encoding/decoding and
 * simple trimming utilities.
 */
public class StringOperation {


    /** Returns {@code string} converted to upper case. */
    public String toUpperString(String string){
       return string.toUpperCase();
    }

    /** Returns {@code string} converted to lower case. */
    public String toLowerString(String string){
        return string.toLowerCase();
    }


    /** Returns whether {@code c} is an ASCII lower-case letter (a-z). */
    public boolean isLowercaseCharacter(char c){
        return c >= 'a' && c <= 'z';
    }

    /** Returns whether {@code c} is an ASCII upper-case letter (A-Z). */
    public boolean isUppercaseCharacter(char c){
        return c >= 'A' && c <= 'Z';
    }

    /** Returns whether {@code c} is an ASCII letter (A-Z or a-z). */
    public boolean isCharacter(char c){
        return isUppercaseCharacter(c) || isLowercaseCharacter(c);
    }

    /**
     * Encodes every char of {@code s} as a backslash, the letter u and four
     * lower-case hex digits. The hex value is zero-padded so that characters
     * below 0x1000 (all of ASCII, for instance) also produce a well-formed
     * escape.
     *
     * @param s text to encode
     * @return the escaped text; empty when {@code s} is empty
     */
    public String toUnicode(String s){
        StringBuilder builder = new StringBuilder(s.length() * 6);
        for (int i = 0; i < s.length(); i++) {
            builder.append(String.format("\\u%04x", (int) s.charAt(i)));
        }
        return builder.toString();
    }

    /**
     * Decodes text produced by {@link #toUnicode(String)}. The input is split
     * on the escape prefix; anything before the first escape is ignored and
     * every following piece is parsed as a hex char code (padded or not).
     *
     * @param unicode the escaped text
     * @return the decoded text
     * @throws NumberFormatException if a piece after an escape prefix is not valid hex
     */
    public String fromUnicode(String unicode){
        String[] ss = unicode.split("\\\\u");
        StringBuilder ret = new StringBuilder(ss.length);
        for(int i=1;i<ss.length;i++){
            ret.append((char) Integer.parseInt(ss[i], 16));
        }
        return ret.toString();
    }

    /** URL-encodes {@code s} using UTF-8. */
    public String toURLCode(String s) throws UnsupportedEncodingException {
        return URLEncoder.encode(s,"UTF-8");
    }

    /** URL-encodes {@code s} using the gbk charset. */
    public String toURLCodeGBK(String s)throws UnsupportedEncodingException{
        return URLEncoder.encode(s,"gbk");
    }

    /** URL-decodes {@code s} using UTF-8. */
    public String fromURLCode(String s) throws UnsupportedEncodingException {
        return URLDecoder.decode(s,"UTF-8");
    }

    /** URL-encodes {@code content} using the charset named {@code code}. */
    public String Encode(String content, String code) throws UnsupportedEncodingException {
        return URLEncoder.encode(content,code);
    }

    /** URL-decodes {@code content} using the charset named {@code code}. */
    public String Decode(String content, String code) throws UnsupportedEncodingException {
        return URLDecoder.decode(content,code);
    }

    /**
     * Removes the first and the last character of {@code s}.
     *
     * @param s text to shorten
     * @return {@code s} without its outer characters; the empty string when
     *         {@code s} has fewer than two characters
     */
    public String deleteLeftAndRight(String s){
        if (s.length() < 2) {
            // nothing remains once both ends are removed
            return "";
        }
        return s.substring(1,s.length()-1);
    }

    /** Collapses every run of whitespace in {@code initialString} into a single space. */
    public String deleteRedundantSpace(String initialString){
        return initialString.replaceAll("\\s+"," ");
    }

    /** Removes all digits from {@code s} and trims the result. */
    private String deleteNumber(String s ){
        return s.replaceAll("\\d+", "").trim();
    }

    /**
     * Collects the ASCII digits contained in {@code s} and parses them as one int.
     *
     * @throws NumberFormatException if {@code s} contains no digit or the digits overflow an int
     */
    private int getNumber(String s){
        StringBuilder digits = new StringBuilder();
        for(int i =0;i<s.length();i++){
            char c = s.charAt(i);
            if(c>='0' && c<='9'){
                digits.append(c);
            }
        }
        return Integer.parseInt(digits.toString());
    }

}


将查询(query)的小说名转换为url链接

分析header里面的query String Parameters部分

多尝试几次小说名之后,我们发现s对应的这个9157106854577873494值并不会改变

而q的值应该是query的urlcode形式,根据前面的ie=gbk,我们可以揣测如下代码

  /**
   * URL-encodes the given text with the gbk charset.
   *
   * @param s text to encode
   * @return the encoded text
   * @throws UnsupportedEncodingException if the gbk charset is not available
   */
  public String toURLCodeGBK(String s) throws UnsupportedEncodingException {
      final String charsetName = "gbk";
      String encoded = URLEncoder.encode(s, charsetName);
      return encoded;
  }
 /**
  * URL-encodes the search text with gbk via {@code StringOperation}.
  * If the charset is unsupported the stack trace is printed and an empty
  * string is returned.
  */
 private String getQueryCode(String query){
     try {
         StringOperation encoder = new StringOperation();
         return encoder.toURLCodeGBK(query);
     } catch (UnsupportedEncodingException e) {
         e.printStackTrace();
         return "";
     }
 }
/**
 * Builds the search URL for a novel name: a fixed prefix, the site code
 * {@code s}, and the gbk URL-encoded query.
 */
public String getChapterDirectoryUrl(String query){
    StringBuilder link = new StringBuilder("https://so.biqusoso.com/s.php?ie=gbk&siteid=biqugex.com&s=");
    link.append(s);
    link.append("&q=");
    link.append(getQueryCode(query));
    return link.toString();
}

    /**
     * Parses the search result page and extracts one {@code Production} per
     * result link.
     *
     * <p>All anchors except the last two are inspected (the original code
     * skips those two as non-results). Spaces are stripped from the anchor
     * text, which is then split at the author marker into name and author;
     * the marker is accepted with an ASCII or a full-width colon. Anchors
     * without the marker are skipped instead of aborting the whole parse
     * with an ArrayIndexOutOfBoundsException. Ids are assigned sequentially
     * starting at 1 in the order results are kept.
     *
     * @param html the HTML of the search result page
     * @return the parsed results; empty when none are found
     */
    public List<Production> getALLProductions(String html){
        ArrayList<Production> ret = new ArrayList<Production>();
        Document document = Jsoup.parse(html);
        Elements elements = document.select("a");
        // the last two anchors on the page are not search results
        int len = Math.max(0, elements.size() - 2);
        for(int i = 0 ;i<len;i++){
            String text = elements.get(i).text().replace(" ", "");
            // limit 2 keeps an empty author instead of dropping the entry
            String[] parts = text.split("作者[:：]", 2);
            if (parts.length < 2) {
                // not a "name + author" entry
                continue;
            }
            Production po = new Production();
            po.setId(ret.size() + 1);
            po.setName(parts[0]);
            po.setAuthor(parts[1]);
            po.setChapterDirectoryHref(elements.get(i).attr("href"));
            ret.add(po);
        }
        return ret;
    }

    /**
     * Searches the site for works matching {@code query}: builds the search
     * URL, downloads the page and parses the results.
     *
     * <p>The URL is built on this instance, so a site code set through
     * {@code setSimpleCode} is honoured (a freshly created service would
     * silently fall back to the default code).
     *
     * @param query novel name to search for
     * @return the matching works; empty when the page yields none
     */
    public List<Production> queryAllLikeProductions(String query){
        Reptile reptile = new Reptile();
        String url = getChapterDirectoryUrl(query);
        String html = reptile.fromURLGetHtml(url).toString();
        return getALLProductions(html);
    }

上图证实猜想正确。

根据作品url获取各章节url

/**
 * Downloads the chapter directory page at {@code url} (3 second timeout)
 * and turns every link matched by ".listmain dl dd a" into a
 * {@code Chapter}. When the link text contains spaces, its first token
 * becomes the id and its last token the name; otherwise the id is empty and
 * the whole text is the name. An I/O failure prints the stack trace and
 * returns the chapters collected so far.
 */
public List<Chapter> getAllChapter(String url) {
    ArrayList<Chapter> allChapters = new ArrayList<Chapter>();
    try {
        Elements links = Jsoup.connect(url).timeout(3000).get().select(".listmain dl dd a");
        for (int idx = 0, total = links.size(); idx < total; idx++) {
            String title = links.get(idx).text();
            String[] tokens = title.split(" ");
            boolean hasPrefix = tokens.length > 1;
            String chapterName = hasPrefix ? tokens[tokens.length - 1] : title;
            String chapterId = hasPrefix ? tokens[0] : "";
            String link = "https://www.biqugex.com" + links.get(idx).attr("href");

            Chapter chapter = new Chapter();
            chapter.setName(chapterName);
            chapter.setUrl(link);
            chapter.setId(chapterId);
            allChapters.add(chapter);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return allChapters;
}

完成章节获取


爬取章节内容 也就是小说内容。

    /**
     * Downloads a chapter page and extracts its text from the
     * {@code #content} element.
     *
     * <p>ASCII and full-width parentheses and the book-title marks are
     * removed, whitespace runs are collapsed, and the text is split on
     * spaces: the first piece becomes the name, the rest the sections.
     * The characters are removed with literal {@code replace} calls whose
     * result is kept; the earlier {@code replaceAll} chain discarded its
     * result and used "(" as a regex, which is an invalid pattern.
     *
     * @param url address of the chapter page
     * @return the parsed text; an empty {@code Fiction} when the page cannot
     *         be fetched or has no {@code #content} element
     */
    public Fiction getFictionByUrl(String url){
        Fiction ret = new Fiction();
        try {
            Document document = Jsoup.connect(url).get();
            Element element = document.select("#content").first();
            if (element == null) {
                // page without a content block: nothing to extract
                return ret;
            }
            String content = element.text()
                    .replace("（", "")
                    .replace("）", "")
                    .replace("(", "")
                    .replace(")", "")
                    .replace("《", "")
                    .replace("》", "");
            StringOperation so = new StringOperation();
            content = so.deleteRedundantSpace(content);
            String[] ss = content.split(" ");
            ret.setName(ss[0]);
            ArrayList<String> alls = new ArrayList<String>();
            for(int i =1;i<ss.length;i++){
                alls.add(ss[i]);
            }
            ret.setSections(alls);

        } catch (IOException e) {
            e.printStackTrace();
        }
        return ret;
    }



最终类ProductionService

package com.ym.reptile.service;

import com.ym.reptile.bean.Chapter;
import com.ym.reptile.bean.Fiction;
import com.ym.reptile.bean.Production;
import com.ym.reptile.util.Reptile;
import com.ym.reptile.util.StringOperation;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.w3c.dom.stylesheets.DocumentStyle;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.List;

/**
 * Crawler service for the novel site: searches works by name, lists the
 * chapters of a work and downloads the text of a chapter.
 */
public class ProductionService {


    // site code sent as the "s" query parameter of the search request
    private String s = "9157106854577873494";

    /** Overrides the site code sent as the "s" parameter of the search URL. */
    public void setSimpleCode(String s){
        this.s =s;
    }

    /**
     * URL-encodes the search text with gbk. If the charset is unsupported the
     * stack trace is printed and an empty string is returned.
     */
    private String getQueryCode(String query){
        try {
            return new StringOperation().toURLCodeGBK(query);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Builds the search URL for a novel name from the fixed prefix, the site
     * code and the gbk URL-encoded query.
     *
     * @param query novel name to search for
     * @return the complete search URL
     */
    public String getChapterDirectoryUrl(String query){
       return "https://so.biqusoso.com/s.php?ie=gbk&siteid=biqugex.com&s="+s+"&q="+getQueryCode(query);
    }


    /**
     * Parses the search result page and extracts one {@code Production} per
     * result link.
     *
     * <p>All anchors except the last two are inspected (the original code
     * skips those two as non-results). Spaces are stripped from the anchor
     * text, which is then split at the author marker into name and author;
     * the marker is accepted with an ASCII or a full-width colon. Anchors
     * without the marker are skipped instead of aborting the whole parse
     * with an ArrayIndexOutOfBoundsException. Ids are assigned sequentially
     * starting at 1 in the order results are kept.
     *
     * @param html the HTML of the search result page
     * @return the parsed results; empty when none are found
     */
    public List<Production> getALLProductions(String html){
        ArrayList<Production> ret = new ArrayList<Production>();
        Document document = Jsoup.parse(html);
        Elements elements = document.select("a");
        // the last two anchors on the page are not search results
        int len = Math.max(0, elements.size() - 2);
        for(int i = 0 ;i<len;i++){
            String text = elements.get(i).text().replace(" ", "");
            // limit 2 keeps an empty author instead of dropping the entry
            String[] parts = text.split("作者[:：]", 2);
            if (parts.length < 2) {
                // not a "name + author" entry
                continue;
            }
            Production po = new Production();
            po.setId(ret.size() + 1);
            po.setName(parts[0]);
            po.setAuthor(parts[1]);
            po.setChapterDirectoryHref(elements.get(i).attr("href"));
            ret.add(po);
        }
        return ret;
    }

    /**
     * Searches the site for works matching {@code query}: builds the search
     * URL, downloads the page and parses the results.
     *
     * <p>The URL is built on this instance, so a site code set through
     * {@link #setSimpleCode(String)} is honoured (a freshly created service
     * would silently fall back to the default code).
     *
     * @param query novel name to search for
     * @return the matching works; empty when the page yields none
     */
    public List<Production> queryAllLikeProductions(String query){
        Reptile reptile = new Reptile();
        String url = getChapterDirectoryUrl(query);
        String html = reptile.fromURLGetHtml(url).toString();
        return getALLProductions(html);
    }

    /**
     * Downloads the chapter directory page at {@code url} (3 second timeout)
     * and turns every link matched by ".listmain dl dd a" into a
     * {@code Chapter}. When the link text contains spaces, its first token
     * becomes the id and its last token the name; otherwise the id is empty
     * and the whole text is the name. An I/O failure prints the stack trace
     * and returns the chapters collected so far.
     *
     * @param url address of the chapter directory page
     * @return the chapters in page order
     */
    public List<Chapter>getAllChapter(String url){

        ArrayList<Chapter> allChapters = new ArrayList<Chapter>();
        try {
            Document document =Jsoup.connect(url).timeout(3000).get();
            Elements elements = document.select(".listmain dl dd a");
            int len = elements.size();
            for(int i = 0;i<len;i++){
                String name = elements.get(i).text();
                String[] ss = name.split(" ");
                String id = "";
                if(ss.length>1){
                    name = ss[ss.length-1];
                    id = ss[0];
                }
                // hrefs on the directory page are site-relative
                String href = "https://www.biqugex.com"+elements.get(i).attr("href");
                Chapter chapter = new Chapter();
                chapter.setName(name);
                chapter.setUrl(href);
                chapter.setId(id);
                allChapters.add(chapter);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return allChapters;
    }


    /**
     * Downloads a chapter page and extracts its text from the
     * {@code #content} element.
     *
     * <p>ASCII and full-width parentheses and the book-title marks are
     * removed, whitespace runs are collapsed, and the text is split on
     * spaces: the first piece becomes the name, the rest the sections.
     * The characters are removed with literal {@code replace} calls whose
     * result is kept; the earlier {@code replaceAll} chain discarded its
     * result and used "(" as a regex, which is an invalid pattern.
     *
     * @param url address of the chapter page
     * @return the parsed text; an empty {@code Fiction} when the page cannot
     *         be fetched or has no {@code #content} element
     */
    public Fiction getFictionByUrl(String url){
        Fiction ret = new Fiction();
        try {
            Document document = Jsoup.connect(url).get();
            Element element = document.select("#content").first();
            if (element == null) {
                // page without a content block: nothing to extract
                return ret;
            }
            String content = element.text()
                    .replace("（", "")
                    .replace("）", "")
                    .replace("(", "")
                    .replace(")", "")
                    .replace("《", "")
                    .replace("》", "");
            StringOperation so = new StringOperation();
            content = so.deleteRedundantSpace(content);
            String[] ss = content.split(" ");
            ret.setName(ss[0]);
            ArrayList<String> alls = new ArrayList<String>();
            for(int i =1;i<ss.length;i++){
                alls.add(ss[i]);
            }
            ret.setSections(alls);

        } catch (IOException e) {
            e.printStackTrace();
        }
        return ret;
    }

}

深圳SEO优化公司泰安网站推广报价桐城百度网站优化多少钱德阳百度爱采购推荐太原优化哪家好防城港seo网站推广推荐文山网站制作银川百度网站优化价格那曲SEO按天扣费烟台seo网站推广公司阜阳品牌网站设计哪家好乌海优秀网站设计多少钱铜川建设网站推荐大理阿里店铺托管公司汕尾模板推广价格南昌阿里店铺运营价格赣州网站优化按天收费公司青岛网络营销推荐朝阳网络营销推荐海西关键词排名价格长沙企业网站设计推荐甘孜设计公司网站推荐常州阿里店铺托管阳江营销型网站建设报价开封网站开发哪家好黄山百度竞价公司凉山网页设计哪家好丹竹头设计公司网站多少钱赤峰网站推广报价和田模板推广报价甘孜网站搜索优化报价歼20紧急升空逼退外机英媒称团队夜以继日筹划王妃复出草木蔓发 春山在望成都发生巨响 当地回应60岁老人炒菠菜未焯水致肾病恶化男子涉嫌走私被判11年却一天牢没坐劳斯莱斯右转逼停直行车网传落水者说“没让你救”系谣言广东通报13岁男孩性侵女童不予立案贵州小伙回应在美国卖三蹦子火了淀粉肠小王子日销售额涨超10倍有个姐真把千机伞做出来了近3万元金手镯仅含足金十克呼北高速交通事故已致14人死亡杨洋拄拐现身医院国产伟哥去年销售近13亿男子给前妻转账 现任妻子起诉要回新基金只募集到26元还是员工自购男孩疑遭霸凌 家长讨说法被踢出群充个话费竟沦为间接洗钱工具新的一天从800个哈欠开始单亲妈妈陷入热恋 14岁儿子报警#春分立蛋大挑战#中国投资客涌入日本东京买房两大学生合买彩票中奖一人不认账新加坡主帅:唯一目标击败中国队月嫂回应掌掴婴儿是在赶虫子19岁小伙救下5人后溺亡 多方发声清明节放假3天调休1天张家界的山上“长”满了韩国人?开封王婆为何火了主播靠辱骂母亲走红被批捕封号代拍被何赛飞拿着魔杖追着打阿根廷将发行1万与2万面值的纸币库克现身上海为江西彩礼“减负”的“试婚人”因自嘲式简历走红的教授更新简介殡仪馆花卉高于市场价3倍还重复用网友称在豆瓣酱里吃出老鼠头315晚会后胖东来又人满为患了网友建议重庆地铁不准乘客携带菜筐特朗普谈“凯特王妃P图照”罗斯否认插足凯特王妃婚姻青海通报栏杆断裂小学生跌落住进ICU恒大被罚41.75亿到底怎么缴湖南一县政协主席疑涉刑案被控制茶百道就改标签日期致歉王树国3次鞠躬告别西交大师生张立群任西安交通大学校长杨倩无缘巴黎奥运

深圳SEO优化公司 XML地图 TXT地图 虚拟主机 SEO 网站制作 网站优化