这是 UD_make_mdx.py 文件,它根据 WordLists 目录下的词条列表逐个抓取解释和例句。
现在运行时遇到了问题。我猜测可能是 API 的问题,但并不确定。运行时的提示如下:
出错:A , 第 1 行
出错:M , 第 1 行
出错:Z , 第 1 行
出错:T , 第 1 行
^CProcess SpawnPoolWorker-27:
Process SpawnPoolWorker-23:
Process SpawnPoolWorker-17:
Process SpawnPoolWorker-24:
Process SpawnPoolWorker-19:
Process SpawnPoolWorker-25:
Process SpawnPoolWorker-21:
Process SpawnPoolWorker-22:
Process SpawnPoolWorker-15:
Process SpawnPoolWorker-14:
Process SpawnPoolWorker-18:
Process SpawnPoolWorker-20:
Traceback (most recent call last):
File "/Users/vivian/Downloads/UrbanDictionary/UD_make_mdx.py", line 512, in <module>
Process SpawnPoolWorker-26:
Process SpawnPoolWorker-13:
Process SpawnPoolWorker-10:
Process SpawnPoolWorker-9:
Process SpawnPoolWorker-5:
Process SpawnPoolWorker-12:
Process SpawnPoolWorker-4:
Process SpawnPoolWorker-11:
Process SpawnPoolWorker-2:
Process SpawnPoolWorker-8:
Process SpawnPoolWorker-6:
Process SpawnPoolWorker-7:
Process SpawnPoolWorker-1:
UD_m(None, None)
File "/Users/vivian/Downloads/UrbanDictionary/UD_make_mdx.py", line 383, in UD_m
Process SpawnPoolWorker-16:
pool.map(letterDownload, letters)
File "/usr/local/Cellar/python@3.9/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/pool.py", line 364, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/usr/local/Cellar/python@3.9/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/pool.py", line 765, in get
self.wait(timeout)
File "/usr/local/Cellar/python@3.9/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/pool.py", line 762, in wait
self._event.wait(timeout)
File "/usr/local/Cellar/python@3.9/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/threading.py", line 574, in wait
signaled = self._cond.wait(timeout)
File "/usr/local/Cellar/python@3.9/3.9.7/Frameworks/Python.framework/Versions/3.9/lib/python3.9/threading.py", line 312, in wait
waiter.acquire()
KeyboardInterrupt
下面是原文件
# -*- coding=UTF-8 -*-
# Urban Dictionary Wordlists to MDX(Html)
# V1.2 (2015-07-15 17:10)
# 作者: firefly(dance.of.firefly@gmail.com)
#
# 作用:根据 UD_wordlists_gen.py生成的单词列表,调用API(http://api.urbandictionary.com/v0/define?term=word),
# 逐个获取解释和例句,格式化排版生成符合MDict的html格式供MDXBuilder处理。
#
# 注意:
# 1.部分词语用此API可能查询不到结果,如:‘f$&@’,'120%',可能是词条热度不够(赞同/反对票数很少)
# 2.部分的meaning为空,仅有example,如:‘IAG’,'Ya Face';甚至部分的word为空,如:http://www.urbandictionary.com/author.php?author=Thecrappymusicteecher
# 3.部分的'example'为空,仅有'meaning',如:‘a, an’
#
# 本程序约定的抓取规则:
# 1.词条必须有word 和 meaning ,example 允许没有;
# 2.用API获取不到的冷词暂时不考虑(其实可以用网页抓取)。
import sys
import re
import os
import time
import requests
import json
from os import path
from multiprocessing import Pool
import urllib.request, urllib.parse, urllib.error
################################################################################
# globals
#
# chars is a list of all possible chars that can be chosen
# char_dict is a dictionary of all possible chars that can be chosen
################################################################################
# 65 for 'A', 90 for 'Z', 97 for 'a', 122 for 'z'
# Every single-character argument the command line accepts: 'A'-'Z', 'a'-'z' and '*'.
chars = list(map(chr, range(65,91))) + list(map(chr, range(97,123))) + ["*"]
# Identity mapping over chars; checkArg uses it for membership checks.
char_dict = dict(zip(chars,chars))
# The folders below are joined onto os.getcwd() by letterDownload, so they are relative to the working directory.
WL_folder = 'Downloads/UrbanDictionary/WordLists' # word-list folder (input)
MDX_folder = 'Downloads/UrbanDictionary/MDXs' # generated MDX source files (output)
FAILED_folder = 'Downloads/UrbanDictionary/Failed' # terms whose online fetch failed: guards against network outages
PROGRESS_folder = 'Downloads/UrbanDictionary/Progress' # per-letter progress (last processed line number): guards against power loss
ABANDON_folder = 'Downloads/UrbanDictionary/Abandon' # terms dropped because none of their definitions/examples met the rules
################################################################################
# 将词语中的特殊字符进行编码,便于进行网址请求
#
# 输入参数:
# term: 编码前的词语
# 输出参数:
# term: 编码后的词语
################################################################################
#如果需要在URL中用到,需要将这些特殊字符换成相应的十六进制的值
# 字符 URL编码值
# 空格 %20
# " %22
# # %23
# % %25
# & %26
# ( %28
# ) %29
# + %2B
# , %2C
# / %2F
# : %3A
# ; %3B
# < %3C
# = %3D
# > %3E
# ? %3F
# @ %40
# \ %5C
# | %7C
def termEncode(term):
    """Percent-encode the URL-special characters of *term* for use in a query string.

    Only the characters listed in the table below are rewritten; everything
    else passes through untouched. A falsy *term* (``None`` or ``''``) is
    returned unchanged.
    """
    if not term:
        return term

    # One translation pass is equivalent to the sequential replacements:
    # no escape sequence introduces a character that is itself in the table
    # except '%', and a single pass never re-reads its own output.
    escapes = {
        '%': '%25',
        ' ': '%20',
        '"': '%22',
        '#': '%23',
        '&': '%26',
        '(': '%28',
        ')': '%29',
        '+': '%2B',
        ',': '%2C',
        '/': '%2F',
        ':': '%3A',
        ';': '%3B',
        '<': '%3C',
        '=': '%3D',
        '>': '%3E',
        '?': '%3F',
        '@': '%40',
        '\\': '%5C',
        '|': '%7C',
    }
    return term.translate(str.maketrans(escapes))
################################################################################
# 联网获取单个词语的所有解释信息(JSON格式)
#
# 输入参数:
# term:单词
# 输出参数:
# js: 解释的JSON对象
# failed: 是否下载失败
################################################################################
def fetchTerm(term):
    """Fetch all definitions of *term* from the Urban Dictionary JSON API.

    Args:
        term: the headword to look up (not yet URL-encoded).

    Returns:
        A ``(js, failed)`` tuple:

        * ``(dict, False)`` - decoded JSON payload containing a non-empty ``'list'``;
        * ``(None, False)`` - the request succeeded but the term has no definitions;
        * ``(None, True)``  - every attempt failed (network error, non-200
          status, or a body that is not a JSON object).
    """
    url = "http://api.urbandictionary.com/v0/define?term=" + termEncode(term)
    headers = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36'
    }
    times = 10  # number of attempts before giving up
    for i in range(times):
        try:
            # The timeout grows with every retry: 5s, 15s, 25s, ...
            response = requests.get(url, headers=headers, timeout=(5 + i * 10))
            if response.status_code != 200:
                raise IOError("网页请求失败!")
            js = json.loads(response.text)
            if not isinstance(js, dict):
                raise ValueError("unexpected JSON payload")
        except (requests.RequestException, IOError, ValueError):
            # Only transport / decoding problems are retried. Catching every
            # Exception here used to hide programming errors (e.g. a KeyError
            # on a missing field) behind ten slow retries and a bogus
            # "download failed" result.
            continue

        # 'result_type' is read with .get(): indexing it directly raised
        # KeyError whenever the payload did not carry that field.
        # NOTE(review): newer API payloads appear to contain only 'list' -
        # confirm against the live API. An empty 'list' is treated as
        # "no results" either way.
        if js.get('result_type') == "no_results" or not js.get('list'):
            print(term + '\n\t' + str(js))
            return None, False
        return js, False

    # All attempts exhausted.
    return None, True
################################################################################
# 格式化传入的词语信息,生成可被 MdxBuilder制作的html格式字符串
#
# 输入参数:
# term:单词; index:解释编号(从‘1’开始); word:解释单词(可能跟主单词形式略有差异);
# meaning: 解释; example: 例句; end: 该单词所有解释是否结束
# 输出参数:
# result: 本次已格式化的字符串
################################################################################
# Matches '[some term]' cross references inside definitions and examples.
_UD_BRACKET_REF = re.compile(r'\[([^\]]+)\]')


def _udTextToHtml(text):
    """Turn raw API text into HTML: line breaks become <br />, [x] becomes an entry:// link."""
    text = text.replace('\r', '\n').replace('\n\n', '\n').replace('\n', '<br />')
    return _UD_BRACKET_REF.sub(r'<a href="entry://\1">\1</a>', text)


def formatTermSnippet(term, index, word, meaning, example, end=False):
    """Format one definition of *term* as an HTML fragment for MdxBuilder.

    Args:
        term: the headword of the entry.
        index: definition number as a string, starting at ``'1'``.
        word: the word as spelled in this definition (may differ from *term*).
        meaning: definition text.
        example: example text; may be empty.
        end: when true, the entry terminator ``'\\n</>'`` is appended.

    Returns:
        The formatted fragment. Nothing but the optional terminator is
        produced unless both *word* and *meaning* are non-empty.
    """
    pieces = []

    if word and meaning:
        # The first definition also carries the entry header:
        # headword line, stylesheet link, top anchor and title box.
        if index == '1':
            CSS = 'Downloads/UrbanDictionary/data/UD.css'
            pieces.append(term + '\n')
            pieces.append('<link rel="stylesheet" type="text/css" href="' + CSS + '"/>')
            pieces.append('<a name="page_top"></a>')
            pieces.append('<div class="UD_word_header"><span class="UD_word_header_word">' + term + '</span></div>')

        # Explanation box: number (linking back to the top), label, content.
        pieces.append('<div class="UD_explanation_box"><span class="UD_item_number"><a href="entry://#page_top">' + index + '</a></span>')
        pieces.append('<span class="UD_explanation_label">' + word + '</span>')
        pieces.append('<span class="UD_explanation_content">' + _udTextToHtml(meaning) + '</span></div>')

        # Example box is optional: some definitions have no example.
        if example:
            pieces.append('<div class="UD_example_box"><ul><li><p><span class="UD_example_sentence">' + _udTextToHtml(example) + '</span></p></li></ul></div>')

    if end:
        pieces.append('\n</>')
    return ''.join(pieces)
################################################################################
# 格式化传入的词语信息,生成可被MdxBuilder制作的html格式字符串
#
# ---注意:解释列表需要按解释准确度进行排序!(此处list已经是有序的,不用排)
# Input:
# term: the headword; js: the decoded JSON payload returned by the API (a dict holding a 'list' of definitions), or None
# Output:
# result: the formatted string for the whole entry, or None when nothing could be formatted
################################################################################
def getFormatedTerm(term, js):
    """Format every usable definition of *term* into one MdxBuilder entry.

    The API payload is expected to look like::

        {"list": [{"word": "...", "definition": "...", "example": "..."}, ...]}

    The definitions are used in the order given (the list is already sorted).
    A definition is usable only when both its ``word`` and ``definition`` are
    non-empty; usable definitions are numbered consecutively from 1.

    Args:
        term: the headword.
        js: decoded JSON payload, or ``None``.

    Returns:
        The formatted entry ending with the ``'\\n</>'`` terminator, or
        ``None`` when *js* is empty or no definition is usable.
    """
    entries = (js.get('list') or []) if js else []
    snippets = []
    for position, entry in enumerate(entries, start=1):
        word = entry.get('word')
        meaning = entry.get('definition')
        example = entry.get('example')
        if word and meaning:
            snippets.append(
                formatTermSnippet(term, str(len(snippets) + 1), word, meaning, example)
            )
        else:
            # str() keeps the report from crashing when a field is None.
            print(term + ' 的第 ' + str(position) + ' 个解释存在问题:' + str(word) + ' --- ' + str(meaning))

    if not snippets:
        # Previously an entry with several definitions, all unusable, came
        # back as the bare terminator '\n</>' instead of None, so it was
        # written to the MDX source rather than being abandoned.
        print(term + ' 的所有解释都不符合要求!原始结果如下:\n' + str(js))
        return None

    # Exactly one terminator closes the entry, whichever definition was last.
    return ''.join(snippets) + '\n</>'
################################################################################
# getLetters returns list
#
# returns a list of corresponding letters and '*' for numbers.
# if the first and last letter are available, and inclusive list is returned
# if the last letter is None, a list of one item is returned (first letter)
################################################################################
def getLetters(firstLetter, lastLetter):
    """Return the list of letter buckets to process.

    * no first letter: ``'A'``..``'Z'`` followed by ``'*'``;
    * first letter only: a one-item list holding it (upper-cased);
    * both: the inclusive upper-cased range first..last; when the last
      letter is ``'*'`` the range runs through ``'Z'`` and ``'*'`` is appended.
    """
    star = '*'

    if not firstLetter:
        return [chr(code) for code in range(65, 91)] + [star]

    first = firstLetter.upper()
    if not lastLetter:
        return [first]

    last = lastLetter.upper()
    if last == star:
        return [chr(code) for code in range(ord(first), 91)] + [star]
    return [chr(code) for code in range(ord(first), ord(last) + 1)]
################################################################################
# 下载单个字母下的所有词语的解释和例句(若上次下载遇到过断电,继续上次的词条进度位置)
################################################################################
def _appendLine(fpath, text):
    """Append *text* plus a newline to the UTF-8 file *fpath*, creating it if needed."""
    with open(fpath, 'a', encoding='utf-8') as fh:
        fh.write(text + '\n')


def letterDownload(letter):
    """Download and format every term listed in the word list of one letter.

    For each non-empty line of ``<WL_folder>/<letter>.txt`` the term is
    fetched with ``fetchTerm``; the formatted entry is appended to the MDX
    source file, a failed download is recorded in the Failed folder, and a
    term without any usable definition is recorded in the Abandon folder.

    If a progress file exists, its first line is read as the number of the
    last line already processed, and lines up to that number are skipped.
    NOTE(review): nothing in this function writes the progress file -
    presumably it is maintained elsewhere or by hand; confirm.

    Args:
        letter: ``'A'``..``'Z'`` or ``'*'``.

    Returns:
        None. Returns silently when the word list cannot be opened, and
        after printing a message when the progress file is not a number.
    """
    # '*' is not allowed in file names, so that bucket is stored as '#.txt'.
    fname = "#.txt" if letter == "*" else letter + ".txt"
    cwd = os.getcwd()

    def in_folder(folder):
        return ''.join([cwd, path.sep, folder, path.sep, fname])

    wl_fpath = in_folder(WL_folder)
    mdx_fpath = in_folder(MDX_folder)
    failed_fpath = in_folder(FAILED_folder)
    progress_fpath = in_folder(PROGRESS_folder)
    abandon_fpath = in_folder(ABANDON_folder)

    try:
        wl_file = open(wl_fpath, 'r', encoding='utf-8')
    except OSError:
        # A missing word list is not an error: there is nothing to do.
        return

    with wl_file:
        # Resume point: only the first line of the progress file is used.
        line_index = None
        if os.path.exists(progress_fpath):
            with open(progress_fpath, 'r', encoding='utf-8') as prgs_file:
                first_line = prgs_file.readline()
            if first_line:
                try:
                    line_index = int(first_line.strip())
                except ValueError:
                    # The progress write may itself have been interrupted.
                    print(progress_fpath + ' 中有错误,请手工检查后再试!')
                    return

        # Make sure the output folders exist before anything is written.
        for out_path in (mdx_fpath, failed_fpath, abandon_fpath):
            os.makedirs(os.path.dirname(out_path), exist_ok=True)

        with open(mdx_fpath, 'a', encoding='utf-8') as mdx_file:
            for i, line in enumerate(wl_file, start=1):
                # Skip lines already handled in a previous run.
                if line_index and i <= line_index:
                    continue
                line = line.strip()
                if not line:
                    continue

                js, failed = fetchTerm(line)
                if failed:
                    _appendLine(failed_fpath, line)
                    print('出错:' + letter + ' , 第 ' + str(i) + ' 行')
                    continue

                result = getFormatedTerm(line, js) if js else None
                if result:
                    mdx_file.write(result + '\n')
                else:
                    # No definitions at all, or none that meets the rules.
                    _appendLine(abandon_fpath, line)

    print(letter + ' 下载完成!')
################################################################################
# 多进程下载词语解释例句并生成MDX源文件(html)
#
# 每个进程负责下载一个字母下的所有词语
################################################################################
def UD_m(firstLetter, lastLetter):
    """Download all selected letters in parallel, one worker process per letter.

    Args:
        firstLetter: first letter of the range, or ``None`` for everything.
        lastLetter: last letter of the range, or ``None`` for a single letter.
    """
    letters = getLetters(firstLetter, lastLetter)
    if not letters:
        # An inverted range such as ('D', 'B') selects nothing, and
        # Pool(0) would raise ValueError.
        return

    # The context manager terminates the workers when the block is left,
    # including when the run is interrupted, instead of leaving them behind.
    with Pool(len(letters)) as pool:
        pool.map(letterDownload, letters)
################################################################################
# checkArg returns NoneType
#
# checks the argument in the list argv
# if the argument in question is not a char or not a known letter, python exits
################################################################################
def checkArg(argv, i):
    """Validate command-line argument *i*; exit the program if it is invalid.

    The argument must be exactly one character and one of the known
    characters in ``char_dict`` (a letter or ``'*'``).

    Args:
        argv: the argument list (``sys.argv``).
        i: index of the argument to check.
    """
    arg = argv[i]
    # dict.has_key() was removed in Python 3; use the `in` operator instead.
    if len(arg) != 1 or arg not in char_dict:
        print( "Please specify a letter or '*'." )
        print( "Argument", str(i), "is not a letter or '*'." )
        print( "Argument supplied is too long: ", arg )
        sys.exit()
################################################################################
# dump success information
#
# tries to write success information into file
# Success info contains: time, success message
################################################################################
def dumpInfo(info):
    """Append *info* to ``MDX_success.txt`` in the current working directory."""
    log_path = ''.join([os.getcwd(), path.sep, 'MDX_success.txt'])
    # The file is created on first use; the with-block closes it even if the write fails.
    with open(log_path, 'a', encoding='utf-8') as log_file:
        log_file.write(info)
################################################################################
# from: http://www.ykuaile.net/archives/3374.html
# 利用139邮箱发短信功能发送免费短信,以达到即使报警信息。
# 短信收费说明:
# 本计费周期赠 30 条,超出按 0.1元/条 计费。
# 向联通、电信用户发短信,与本地资费相同,不计入赠送条数。
# 一次可发给10个号码,每天限发250条,每月限发2500条。
################################################################################
def send_sms(cmcc_tel, cmcc_passwd, send_phone, content):
    """Send a short SMS through the 139 mailbox web interface.

    Logs in with the given account, posts the message, then logs out.

    Args:
        cmcc_tel: account phone number used to log in.
        cmcc_passwd: account password.
        send_phone: phone number that receives the message.
        content: message text; must be shorter than 70 characters.

    Returns:
        An ``'Error: ...'`` string when validation, login or sending fails
        before a result is available; otherwise the tuple
        ``('Sent:', send_phone, 'Success')`` or ``('Sent:', send_phone, 'Failed')``.
    """
    import ast  # local import: only this function needs it

    if not cmcc_tel.strip() or not cmcc_passwd.strip() or not send_phone.strip() or not content.strip():
        return 'Error: Parameter error'
    if len(content) >= 70:
        return 'Error: Exceeded the character limit'

    # --- log in to the 139 mailbox ---
    url = "https://wapmail.10086.cn/index.htm"
    headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q:0.9,image/webp,*/*;q:0.8',
        'Accept-Encoding':'gzip,deflate,sdch',
        'Host':'wapmail.10086.cn',
        'Referer':'http://wapmail.10086.cn/',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0'
    }
    data = {
        'ur' : cmcc_tel,
        'pw': cmcc_passwd
    }
    cmcc_session = requests.Session()
    try:
        cmcc_res = cmcc_session.post(url, headers=headers, data=data)
    except requests.RequestException:
        return 'Error: Login Connection Failed'

    # Still on the login URL after the POST means the login was rejected.
    # This must be checked before the URL is parsed: parsing an unchanged
    # URL fails first and used to be misreported as a connection failure.
    if cmcc_res.url == url:
        return 'Error: login Failed'

    # The session id and 'vn' value are taken from the post-login URL's
    # '&'-separated parts.
    try:
        url_parts = cmcc_res.url.split('&')
        user_vn = url_parts[2].replace('vn=', '')
        user_sid = url_parts[0].split('=')[1]
    except IndexError:
        return 'Error: Login Connection Failed'

    # --- send the message ---
    sms_url = "http://m.mail.10086.cn/ws12/w3/w3smsend"
    sms_hearder = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q:0.9,image/webp,*/*;q:0.8',
        'Accept-Encoding':'gzip,deflate,sdch',
        'Host':'m.mail.10086.cn',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Referer':'http://m.mail.10086.cn/bv12/sendsms.html?&sid=%s&vn=%s&vid=&cmd=40' % (user_sid,user_vn),
        'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0'
    }
    sms_data = {
        'sid' : user_sid,
        'vn' : user_vn,
        'cmd' : '2',
        'content' : content,
        'reciever' : send_phone
    }
    try:
        sms_send = cmcc_session.post(sms_url, headers=sms_hearder, data=sms_data)
    except requests.RequestException:
        return 'Error: Send Connection Failed'

    # SECURITY: the response body comes from a remote server and used to be
    # passed to eval(), which would execute whatever code it contained.
    # ast.literal_eval accepts the same literal syntax without running code.
    sms_text = urllib.parse.unquote(sms_send.text)
    try:
        sms_result = ast.literal_eval(sms_text.replace('null', '"null"'))
    except (ValueError, SyntaxError):
        sms_result = None  # unparseable reply: reported as 'Failed' below

    # --- log out of the 139 mailbox ---
    logout_url = "http://m.mail.10086.cn/wp12/w3/logout"
    logout_data = {
        'sid' : user_sid,
        'vn' : user_vn
    }
    logout_hearder = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q:0.9,image/webp,*/*;q:0.8',
        'Accept-Encoding':'gzip,deflate,sdch',
        'Host':'m.mail.10086.cn',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Referer':'http://m.mail.10086.cn/bv12/home.html?&sid=%s&vn=%s' % (user_sid,user_vn),
        'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0'
    }
    cmcc_session.post(logout_url, headers=logout_hearder, data=logout_data)

    # --- result code ---
    # NOTE(review): the key is spelled 'eroerCode' in the original code;
    # presumably that matches the service's reply - confirm.
    try:
        result_code = str(sms_result['result']['eroerCode'])
    except (KeyError, TypeError):
        result_code = None
    if result_code == '0':
        return 'Sent:',send_phone,'Success'
    else:
        return 'Sent:',send_phone,'Failed'
if __name__ == "__main__":
    # Usage: UD_make_mdx.py [firstLetter [lastLetter]]
    #   no argument   - process every letter and '*'
    #   one argument  - process that single letter
    #   two arguments - process the inclusive range
    # Any other argument count runs nothing; only the timing is logged.
    ISOTIMEFORMAT = '%Y-%m-%d %X'

    start = time.time()
    dumpInfo('------------------------\n' + time.strftime(ISOTIMEFORMAT, time.localtime(start)) + ' 开始处理>>>\n')

    cli_args = sys.argv[1:]
    if len(cli_args) <= 2:
        # Validate every supplied argument before starting any work.
        for position in range(1, len(cli_args) + 1):
            checkArg(sys.argv, position)
        first, last = (cli_args + [None, None])[:2]
        UD_m(first, last)

    end = time.time()
    waste_time = str(end - start)
    dumpInfo(time.strftime(ISOTIMEFORMAT, time.localtime(end)) + ' <<<结束处理\n' + '耗时:' + waste_time + '秒!\n')
    print("耗时: %s 秒!" % (waste_time))
    # Optional: send_sms(...) can be called here to send a completion notice.