最近想尝试自己爬虫词典 ,用来练手。 在隔壁网站上发现 urbandictionary.com 网站爬取的源码
原帖地址:[2016.07.23]Urban Dictionary美国俚语词典[开源] - MDict 词库资源区 - MDict Dictionaries - 掌上百科 - PDAWIKI - Powered by Discuz!
源代码我放在这里,有兴趣研究自己爬网站的朋友可以一起学习交流
链接:阿里云盘分享
我在执行 UD_wordlists_gen.py 文件时 , 给出报错信息 有点无法理解 。不知道是什么意思 还请大神解惑
/usr/local/bin/Python3 /Users/vivian/Downloads/UrbanDictionary/UD_wordlists_gen.py
❯ /usr/local/bin/Python3 /Users/vivian/Downloads/UrbanDictionary/UD_wordlists_gen.py
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/local/Cellar/[email protected]/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/usr/local/Cellar/[email protected]/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/pool.py", line 48, in mapstar
return list(map(*args))
File "/Users/vivian/Downloads/UrbanDictionary/UD_wordlists_gen.py", line 171, in task
file = open(fpath, 'a', encoding='utf-8') # open or create file in append mode
FileNotFoundError: [Errno 2] No such file or directory: '/Users/vivian/WordLists/B.txt'
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/vivian/Downloads/UrbanDictionary/UD_wordlists_gen.py", line 243, in <module>
udwg_m(None, None)
File "/Users/vivian/Downloads/UrbanDictionary/UD_wordlists_gen.py", line 188, in udwg_m
pool.map(task, letters)
File "/usr/local/Cellar/[email protected]/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/pool.py", line 364, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/usr/local/Cellar/[email protected]/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/pool.py", line 771, in get
raise self._value
FileNotFoundError: [Errno 2] No such file or directory: '/Users/vivian/WordLists/B.txt'
这是原文件
# -*- coding=UTF-8 -*-
# Urban Dictionary Wordlist Generator
# V1.2 (2015-07-20 09:02)
# 作者: firefly([email protected])
#
# 作用:分析抓取UrbanDictionary网站上的所有词汇列表,保存起来(每个字母一个文件,每行一个词汇),
# 便于后续使用UD_make_mdx.py按词汇批量抓取解释和例句。
# 环境:Python 3.4.3, BeautifulSoup 4.4, Requests 2.7
# 注: 受 https://github.com/natemccoy/udwg 启发,并在此基础上修改优化(支持多进程)。
#
# Usage
# At this point, the program is command line only, but there are plans to create a GUI in the future.
# This program is written in Python and is very easy to use, just simply type:
# $> python uswg.py
# And all words from Urban Dictionary will be printed out to STDOUT
#
# if you would like just one letter, type:
# $> python uswg.py A
# to get all the words that start with the letter 'A':
#
# if you would like a range, say 'B' through 'D', type:
# $> python uswg.py B D
#
# Finally, if you would like words that what with special characters or numbers, use the '*' character, type:
# $> python uswg.py \*
#
# Keep in mind that typing:
# $> python uswg.py A \*
# is Equivalent to typing:
# $> python uswg.py
#
# Refined by firefly to make it work like a charm! (support multiprocessing & error processing!)
import sys
import re
import os
import time
import requests
from os import path
from bs4 import BeautifulSoup
from multiprocessing import Pool
################################################################################
# globals
#
# chars is a list of all possible chars that can be chosen
# char_dict is a dictionary of all possible chars that can be chosen
################################################################################
# 65 for 'A', 90 for 'Z', 97 for 'a', 122 for 'z'
chars = list(map(chr, range(65,91))) + list(map(chr, range(97,123))) + ["*"]
char_dict = dict(zip(chars,chars))
folder = 'WordLists'
################################################################################
# dump error information
#
# tries to write error information into file
# Error info contains: when, which letter & page & exception message
################################################################################
def dumpError(letter, page, info):
fname='error.log'
fpath = ''.join([os.getcwd(), path.sep, fname])
fw = open(fpath, 'a', encoding='utf-8') # open or create file
timestamp = time.asctime( time.localtime(time.time()) )
data = '---------------\n'+ timestamp +'\nLetter: "' + letter+'",page: "' +page +'",info: "' + info +'"\n'
try:
fw.write(data)
finally:
fw.close()
################################################################################
# dump success information
#
# tries to write success information into file
# Success info contains: time, success message
################################################################################
def dumpSuccess(info):
fname='Wordlists_success.txt'
fpath = ''.join([os.getcwd(), path.sep, fname])
fw = open(fpath, 'a', encoding='utf-8') # open or create file
timestamp = time.asctime( time.localtime(time.time()) )
data = timestamp +'\n' + info +'\n'
try:
fw.write(data)
finally:
fw.close()
################################################################################
# 请求页面,分析,获取数据
#
# return page number total-counts when 'page' parameter is None(default)
# otherwise it returns None
################################################################################
def getData(file, letter, page=None):
url = "http://www.urbandictionary.com/browse.php?character="+letter
if page:
url = url + "&page=" +page
print('---正在处理第 ' + page +' 个页面: ', end = '')
else:
print('=====开始处理字符 ' + letter + ' 下的词语: ', end = '')
headers = {
'Connection': 'Keep-Alive',
'Accept': 'text/html, application/xhtml+xml, */*',
'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36'
}
times =5 #设置网页下载失败后重试的次数
for i in range(0, times):
try:
response = requests.post(url, headers=headers, timeout = 5+i*10) #每次重试的超时时间依次递增
soup = BeautifulSoup(response.text)
if soup :
if i >0 :
print('\n第 '+ str(i) +'/' + str(times) +' 次重试成功!')
break
else:
raise IOError("已下载的网页有问题!")
except Exception as e:
print(e)
if i == (times - 1) :
print('下载页面已达到最大重试次数!!!')
dumpError(letter, str(page) ,str(e)) #写入错误信息
return None
else:
print('准备第 '+ str(i+1) +'/' + str(times) +' 次重试')
# 未指定页面,获取页面总数
num =None
if page == None :
pg_indicator = [a.attrs.get('href') for a in soup.select('li a[href^=/browse.php?character=]')]
pg_count = pg_indicator[-1]
pattern = re.compile('.*&page=(\d+)', re.I)
num = pattern.match(pg_count).group(1)
print('共 ' + num + ' 个页面====='+num)
# 获取页面上的词组
lst = [a.get_text('href') for a in soup.select('li a[href^=/define.php?term=]')]
if page == None :
print('---正在处理第 1 个页面: ', end = '')
print('共 ' + str(len(lst)) + ' 个词语---'+str(len(lst)))
if len(lst)>0 :
try:
for w in lst:
w = w.strip() # 默认首尾删除空白符(包括'\n', '\r', '\t', ' ')
if w :
file.write(w + '\n')
except Exception as e:
print(e)
if num:
return int(num)
else:
return None
################################################################################
# 下载单个字母下的所有词语
################################################################################
def task(letter):
if letter == "*" : #操作系统不允许文件名包含*号
fname = "#.txt"
else :
fname = letter + ".txt"
global folder
fpath = ''.join([os.getcwd(), path.sep, folder, path.sep, fname])
if os.path.exists(fpath) :
os.remove(fpath) # already exists
file = open(fpath, 'a', encoding='utf-8') # open or create file in append mode
pageCount = getData(file, letter)
if pageCount > 1:
for p in range(2,pageCount+1):
getData(file, letter, str(p))
file.close()
################################################################################
# 多进程下载特定范围内的多个字母下的词语
#
# 每个进程负责下载一个字母下的所有词语
################################################################################
def udwg_m(firstLetter, lastLetter):
letters = getLetters(firstLetter, lastLetter)
numOfProcs = len(letters) #下载进程数
pool = Pool(numOfProcs)
pool.map(task, letters)
################################################################################
# getLetters returns list
#
# returns a list of corresponding letters and '*' for numbers.
# if the first and last letter are available, and inclusive list is returned
# if the last letter is None, a list of one item is returned (first letter)
################################################################################
def getLetters(firstLetter, lastLetter):
star = '*'
start = 65 # 65 for 'A"
end = 91 # 90 for 'Z'
letters = []
if firstLetter and lastLetter:
firstLetter = firstLetter.upper()
lastLetter = lastLetter.upper()
if lastLetter == star:
letters = list(map(chr, range(ord(firstLetter),91))) + ["*"]
else:
letters = list(map(chr, range(ord(firstLetter),ord(lastLetter)+1)))
elif firstLetter:
firstLetter = firstLetter.upper()
if firstLetter == star:
letters = [star]
else:
letters = [firstLetter]
else:
letters = list(map(chr, range(65,91))) + ["*"]
return letters
################################################################################
# checkArg returns NoneType
#
# checks the argument in the list argv
# if the argument in question is not a char or not a known letter, python exits
################################################################################
def checkArg(argv, i):
if ( len(argv[i]) != 1 ) or ( not char_dict.has_key(argv[i]) ):
print( "Please specify a letter or '*'." )
print( "Argument", str(i), "is not a letter or '*'." )
print( "Argument supplied is too long: ", argv[i] )
sys.exit()
################################################################################
# __main__
#
# checks arguments and calls udwg() with corresponding arguments
# if the arguments are faulty, the program will exit with a message
################################################################################
if __name__ == "__main__":
argc = len(sys.argv)
start = time.time()
if argc == 1:
# udwg_m('Z', '*')
udwg_m(None, None)
elif argc == 2:
checkArg(sys.argv, 1)
udwg_m(sys.argv[1], None)
elif argc == 3:
checkArg(sys.argv, 1)
checkArg(sys.argv, 2)
udwg_m(sys.argv[1], sys.argv[2])
end = time.time()
print("耗时: %s 秒!" % (end-start))
dumpSuccess("耗时: "+str(end-start) +" 秒!")
为了方便查看, 我上传了原文件 UD_wordlists_gen.py (9.8 KB)