R语言--爬取小说-战略级天使

目的

将小说抓取并保存到本地,便于离线阅读;同时借此练习 R 语言爬虫。示例网站:https://www.-----.com/yuedu/15111/

实例

完整代码如下:
library("stringr")
library("rvest")
library('RCurl')
library('curl')
library('downloader')
library('RSelenium')
# Build the chapter table (title, URL) from the novel's index page.
# NOTE(review): setwd() ties the script to one machine; the output files
# written later in this script are created relative to this directory.
setwd('H:\\R\\R脚本\\爬虫')
url <- "https://www.-----.com/yuedu/15111/"
htmlpage <- read_html(url)

# Select the chapter anchors once and reuse them for both titles and links,
# so the two vectors are guaranteed to line up element by element.
chapter_nodes <- html_nodes(
  htmlpage,
  'div#wrapper div.box_con div#list dl dd a'
)

xs_chapter <- html_text(chapter_nodes)
head(xs_chapter) # inspect the titles

# Read only the href attribute. html_attrs() returns *every* attribute of each
# anchor, so unlist() would produce more entries than there are chapters as
# soon as an anchor carries anything besides href (e.g. title or class).
xs_link <- html_attr(chapter_nodes, "href")
head(xs_link) # inspect the raw links

# Prefix the site root to build the chapter URLs.
xs_link <- paste0("https://www.-----.com", xs_link)

# Two-column character matrix: column 1 = chapter title, column 2 = URL.
# The first 10 entries are dropped, presumably because the index page repeats
# the latest chapters at the top of the list -- TODO confirm against the site.
xs_content <- cbind(xs_chapter[-(1:10)], xs_link[-(1:10)])

# Trial run: fetch the first two chapters, pull the text out of div#content,
# and write a small sample file to check the result.
ms_f1 <- html_text(
  html_nodes(
    read_html(xs_content[1, 2], encoding = "GB18030"),
    "div#content"
  )
)
ms_f2 <- html_text(
  html_nodes(
    read_html(xs_content[2, 2], encoding = "GB18030"),
    "div#content"
  )
)
# Interleave each chapter title with its text.
ms_f <- c(
  xs_content[1, 1], ms_f1,
  xs_content[2, 1], ms_f2
)
# Replace non-breaking spaces with ordinary spaces before writing.
write(
  gsub("\u00A0", " ", ms_f),
  file = "1.txt"
)

#' Download the text of every chapter listed in a chapter table.
#'
#' @param x A two-column character matrix (or data frame): column 1 holds the
#'   chapter titles, column 2 the chapter URLs.
#' @return A character vector containing, for each row in order, the chapter
#'   title followed by the text of the page's `div#content` node(s). Returns
#'   `character(0)` when `x` has no rows.
xs_down <- function(x) {
  n_chapters <- nrow(x)
  # An empty table needs no requests. The former `1:length(x[, 1])` iterated
  # over c(1, 0) here and failed with a subscript-out-of-bounds error.
  if (n_chapters == 0L) {
    return(character(0))
  }

  # Collect the pieces in a preallocated list rather than growing a vector
  # with c() on every iteration.
  chapters <- vector("list", n_chapters)
  for (i in seq_len(n_chapters)) {
    # Pages are decoded as GB18030, as in the rest of this script.
    page <- read_html(x[i, 2], encoding = "GB18030")
    body <- html_text(html_nodes(page, "div#content"))
    chapters[[i]] <- c(x[i, 1], body)
  }
  unlist(chapters, use.names = FALSE)
}
# Export the novel: download all chapters, replace non-breaking spaces with
# ordinary spaces, and write the result to a single text file.
xs_content %>%
  xs_down() %>%
  gsub("\u00A0", " ", .) %>%
  write(file = "战略级天使.txt")