最近闲来无事,拿来练练手。
注:
由于网站可能会变动,本代码不保证后面一直都能用,仅讲述抓取的思路;
个人纯属研究使用,请不要应用于商业目的;
使用语言:Python
版本:3.4.3
依赖:requests、beautifulsoup4(可以使用pip install进行安装)
代码也比较简单,直接贴上来:
HttpClient.py
# -*- coding: utf-8 -*-
import requests


def make_request(url):
    """Fetch *url* with HTTP GET and return the ``requests.Response``.

    Prints diagnostic info — the content type and the three encoding views
    requests exposes — to help debug the gb2312/gb18030 mojibake problem
    described in the surrounding post.

    timeout=(30, 90) means: 30 s to establish the connection, 90 s to read.
    """
    print('make_request: ', url)
    r = requests.get(url, timeout=(30, 90))
    # NOTE(review): no status-code check here — callers will also receive
    # error pages. The original commented-out `if r.status_code == 200:` hints
    # this was considered and dropped.
    # 'content-type' may be missing from the response headers; .get() avoids
    # the KeyError the original subscript access could raise.
    print('content-type: ', r.headers.get('content-type'))
    print('encoding: ', r.encoding)
    print('apparent_encoding: ', r.apparent_encoding)
    return r
Kanunu8.py
# -*- coding: utf-8 -*-
import os
import sys
import re
import encodings

# Make modules in sibling folders of the parent directory importable.
sys.path.append("..")

# Work around gb2312 mojibake: pages declare gb2312 but actually contain
# gb18030 characters, so alias the decoder.
encodings.aliases.aliases['gb2312'] = 'gb18030'

from bs4 import BeautifulSoup
from _pyio import open
from util import *

book_url = ''
book_name = ''

# Skip links that point at an author page rather than a chapter.
writer_link_pattern = re.compile(r'.*/writer/\d+\.html')

# Windows forbids these characters in file names, so strip them.
# (The '|' separators inside the character class are redundant but harmless —
# they just add the literal '|' to the forbidden set, which is also desired.)
window_illegal_file_name_pattern = re.compile(r'[\\|/|:|\*|\?|"|<|>|\|]')


def find_tbody(tag):
    """bs4 filter: match the <tbody> that holds the chapter list.

    NOTE(review): defined but not called in this script's main flow —
    kept for compatibility; presumably used while experimenting.
    """
    if tag.name == 'tbody':
        if tag.find('tbody') is None and tag.find('strong').string == '正文':
            return True
        elif '发布时间' in tag.get_text():
            return True
    return False


def strong_with_no_href(tag):
    """bs4 filter: a <strong> holding a <font> but no link (a chapter title)."""
    return tag.name == 'strong' and tag.a is None and tag.font is not None


def find_title(tag):
    """Extract the chapter title; pages vary between h1, h2 and bare <strong>."""
    if tag.h1 is not None:
        return tag.h1.font.string
    elif tag.h2 is not None:
        return tag.h2.font.string
    else:
        return tag.find(strong_with_no_href).font.string


def make_soup(html):
    """Parse *html* and print the encodings bs4 detected (mojibake debugging)."""
    # , from_encoding='gb18030'
    soup = BeautifulSoup(html, "html.parser")
    print('original_encoding: ', soup.original_encoding,
          ', declared_html_encoding: ', soup.declared_html_encoding,
          ', from_encoding: ', soup.from_encoding)
    return soup


def get_legal_window_file_name(name):
    """Strip Windows-illegal characters from *name*; None becomes 'unknown'."""
    if name is None:
        return 'unknown'
    return window_illegal_file_name_pattern.sub('', name)


if __name__ == '__main__':
    book_url = input('请输入电子书URL:')

    # Fetch the book's index page.
    request = HttpClient.make_request(book_url)
    html = request.content
    soup = make_soup(html)

    # Book title becomes the output file name.
    book_name = soup.find('title').string
    path = './' + get_legal_window_file_name(book_name) + '.txt'

    # Collect all chapter links. find_all() returns a (possibly empty) list,
    # never None, so plain truthiness is the correct guard here (the original
    # `if not all_link is None` could never be false).
    links = []
    for tmp in soup.find_all('tbody'):
        if len(tmp.find_all('tr')) > 1:
            all_link = tmp.find_all('a')
            if all_link:
                links.extend(all_link)

    # Chapter hrefs are relative; derive the base URL they resolve against.
    if book_url.endswith('.html'):
        parent_url = book_url[0:book_url.rindex('/') + 1]
    else:
        parent_url = book_url

    with open(path, 'w', encoding="utf-8") as f:
        for link in links:
            # Author-page link — skip it.
            if writer_link_pattern.match(link['href']):
                continue
            print('\n', link.string)
            url = parent_url + link['href']
            print(url)
            response = HttpClient.make_request(url)
            chapter_soup = make_soup(response.content)
            chapter_name = find_title(chapter_soup)
            # Chapter heading, separated by blank lines.
            f.write('\n\n')
            f.write(chapter_name)
            f.write('\n\n')
            # Chapter body. The original appended .replace('', ''), which is a
            # no-op in Python (likely a character was lost when the post was
            # pasted) — removed here.
            f.write(chapter_soup.find('p').get_text())
            f.flush()
    print('电子书已成功保存: ', path)
遇到的问题:
不同的书(甚至章节)标题内容、字体(h1,h2...)、标签结构都不同;
编码问题,抓下来是乱码,具体原因是页面声明的编码是 gb2312 而实际内容是 gb18030,需要在解码前做编码别名映射(见代码中对 encodings.aliases 的处理);
应该是为了增加爬取的难度吧,不过只能针对遇到的问题进行分析、解决;