[Seeking] Postpartum nanny (月嫂) - Middletown, CT (repost) # NextGeneration - We Love Babies
o*g
1
A question about Python Scrapy, though it is really about program-design logic.

I use Scrapy to crawl a page: each crawl scrapes one string, parses a number out of the response, and writes the result out. There are two functions, start_requests and parse. The input is a file containing a single column of strings; start_requests loops over that file and issues a request per string, and parse extracts the number and appends it to an output file.

Now the requirement has changed: each pass of the start_requests loop reads two strings, one English and one its Chinese translation. Each is scraped and parsed into a number, and the two numbers must be added together and written as the last column of the output file, in this format:

English string    Chinese string    date    sum of the two numbers
XXX               XXX               XXX     XXX

start_requests issues a FormRequest with callback=parse, and parse handles the response. But if each pass scrapes two strings, there seem to be two callbacks, parse1 and parse2, i.e. two separate parse functions, and data apparently cannot flow between them, so the two numbers can never be added together for the output. How do I solve this?
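The usual Scrapy answer is to chain the two requests instead of yielding them independently: parse the first response, stash its number in Request.meta, and yield the second request from inside the first callback, so both numbers end up in one scope. A minimal sketch of the idea (extract_number and second_url are placeholders, not code from this post; both methods sit inside the spider class):

def parse_first(self, response):
    number1 = extract_number(response)   # whatever parsing produces the first number
    # Issue the second request from inside the first callback,
    # carrying the first number along in meta.
    yield scrapy.Request(url=second_url,
                         meta={'number1': number1},
                         callback=self.parse_second)

def parse_second(self, response):
    number2 = extract_number(response)
    total = response.meta['number1'] + number2   # both numbers in one place

A fuller sketch adapted to this spider follows the two examples below.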
Two examples follow.

Example 1: scrape one string at a time
#!/usr/bin/python
# coding: utf-8
import datetime
import scrapy
import time
from bs4 import BeautifulSoup

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    headers = {
        'Cookie': 'SUV:003C177DD8CDE00A5983B697C4178439;CXID:10B16D348AB916252883D43F2491459E;IPLOC:US;SUID:0AE0CDD85B68860A585434400004F97E;weixinIndexVisited:1;JSESSIONID:aaa0tnqzUtC66c0zZHQfw;ABTEST:4|1518221833|v1;SNUID:D9321F0AD2D6B7D5C7F18502D39584AB;sct:43;ppinf:5|1518544445|1519754045|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo0Om1hc2h8Y3J0OjEwOjE1MTg1NDQ0NDV8cmVmbmljazo0Om1hc2h8dXNlcmlkOjQ0Om85dDJsdUVtc05QMnRlaUFwRm45UnNTLV9JOGNAd2VpeGluLnNvaHUuY29tfA;pprdig:Kelig9IfK1LADeDmIEHzS9HJCCMH0wdFwYiKxVEyGntRyJ2a1xYBhf2kxxBxuzpRJ1HDGMGty2WUtuAJ0dWhbKtzOm-Ol_p1y-WmvV-fqVvqTxU3W9Z1CDehvNLSQFqELiiJKXRdY4w5B4sjL8u65lTMQ62gF6GlOHnN5zsTF0g;sgid:04-31089959-AVqDJj3ZMo0w5mfTqic6uHn8;ppmdig:1518544446000000447a37e7df0d662376a44b14a63a9073',
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Referer": "http://weixin.sogou.com/weixin?type=2"
    }

    def start_requests(self):
        # One query string per line of the input file.
        with open('file99.txt', 'r') as f:
            for query in f:
                query = query.strip()   # drop the trailing newline
                self.log("%s" % query)
                yield scrapy.http.FormRequest(
                    url='http://weixin.sogou.com/weixin',
                    formdata={'type': '2',
                              'ie': 'utf8',
                              'query': query,
                              'tsn': '1',
                              'ft': '',
                              'et': '',
                              # 'sst0': str(int(time.time()*1000)),
                              # 'page': str(1),
                              'interation': '',
                              'wxid': '',
                              'usip': ''},
                    headers=self.headers, method='get', dont_filter=True,
                    meta={'dont_redirect': True,
                          'handle_httpstatus_list': [301, 302, 303]},
                    callback=self.parse)

    def parse(self, response):
        filename1 = "quotes-111.txt"
        with open(filename1, "a") as k:
            soup = BeautifulSoup(response.body, 'html.parser')
            # The search box echoes the query back; reuse it for the output row.
            row2 = soup.find_all('input', attrs={"class": "query"})
            cc_rating_text = u"约"   # "about", precedes the result count
            dd_rating_text = u"条"   # counter word that follows the count

            # The result count is shown as e.g. "约100条结果": slice out the
            # number between the two marker characters.
            for row in soup.find_all('div', attrs={"class": "mun"}):
                line = row.text.strip()
                tag_found = line.find(cc_rating_text)
                tag_found2 = line.find(dd_rating_text)
                rating = line[tag_found + 1:tag_found2]
                date11 = datetime.datetime.now().strftime("%m/%d/%Y")
                k.write(row2[0]["value"].encode('utf8') + "\t" + str(date11) + "\t" + str(rating) + "\n")

            # No count div on the page: fall back to counting result entries.
            if not soup.find_all('div', attrs={"class": "mun"}):
                row1 = 0
                for kk in soup.find_all('p', attrs={"class": "txt-info"}):
                    row1 = row1 + 1
                date22 = datetime.datetime.now().strftime("%m/%d/%Y")
                k.write(row2[0]["value"].encode('utf8') + "\t" + str(date22) + "\t" + str(row1) + "\n")

        self.log("Saved file %s" % filename1)
Example 2: read two strings per loop iteration and scrape both in one loop; but the results come out stuck together in the output and the two numbers cannot be added.
#!/usr/bin/python
# coding: utf-8
import datetime
import itertools
import scrapy
import time
from bs4 import BeautifulSoup

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    headers = {
        'Cookie': 'SUV:003C177DD8CDE00A5983B697C4178439;CXID:10B16D348AB916252883D43F2491459E;IPLOC:US;SUID:0AE0CDD85B68860A585434400004F97E;weixinIndexVisited:1;JSESSIONID:aaa0tnqzUtC66c0zZHQfw;ABTEST:4|1518221833|v1;SNUID:D9321F0AD2D6B7D5C7F18502D39584AB;sct:43;ppinf:5|1518544445|1519754045|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo0Om1hc2h8Y3J0OjEwOjE1MTg1NDQ0NDV8cmVmbmljazo0Om1hc2h8dXNlcmlkOjQ0Om85dDJsdUVtc05QMnRlaUFwRm45UnNTLV9JOGNAd2VpeGluLnNvaHUuY29tfA;pprdig:Kelig9IfK1LADeDmIEHzS9HJCCMH0wdFwYiKxVEyGntRyJ2a1xYBhf2kxxBxuzpRJ1HDGMGty2WUtuAJ0dWhbKtzOm-Ol_p1y-WmvV-fqVvqTxU3W9Z1CDehvNLSQFqELiiJKXRdY4w5B4sjL8u65lTMQ62gF6GlOHnN5zsTF0g;sgid:04-31089959-AVqDJj3ZMo0w5mfTqic6uHn8;ppmdig:1518544446000000447a37e7df0d662376a44b14a63a9073',
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Referer": "http://weixin.sogou.com/weixin?type=2"
    }

    def start_requests(self):
        # Read the file two lines at a time: the English string, then its
        # Chinese translation.
        with open('file66.txt', 'r') as f:
            for line1, line2 in itertools.izip_longest(*[f] * 2):
                query = line1.strip()
                query2 = line2.strip()
                query3 = [query, query2]
                # self.log("%s" % query)
                # Both strings get independent requests with the same
                # callback, so their responses arrive separately.
                for item in query3:
                    yield scrapy.http.FormRequest(
                        url='http://weixin.sogou.com/weixin',
                        formdata={'type': '2',
                                  'ie': 'utf8',
                                  'query': item,
                                  'tsn': '1',
                                  'ft': '',
                                  'et': '',
                                  # 'sst0': str(int(time.time()*1000)),
                                  # 'page': str(1),
                                  'interation': '',
                                  'wxid': '',
                                  'usip': ''},
                        headers=self.headers, method='get', dont_filter=True,
                        meta={'dont_redirect': True,
                              'handle_httpstatus_list': [301, 302, 303]},
                        callback=self.parse1)

    def parse1(self, response):
        filename1 = "quotes-111.txt"
        with open(filename1, "a") as k:
            soup = BeautifulSoup(response.body, 'html.parser')
            row2 = soup.find_all('input', attrs={"class": "query"})
            cc_rating_text = u"约"
            dd_rating_text = u"条"
            if soup.find_all('div', attrs={"class": "mun"}):
                for row in soup.find_all('div', attrs={"class": "mun"}):
                    line = row.text.strip()
                    tag_found = line.find(cc_rating_text)
                    tag_found2 = line.find(dd_rating_text)
                    rating1 = line[tag_found + 1:tag_found2]
                    date11 = datetime.datetime.now().strftime("%m/%d/%Y")
                    # k.write(row2[0]["value"].encode('utf8') + "\t" + str(date11) + "\t" + str(rating1) + "\n")
            else:
                rating1 = 0
                for kk in soup.find_all('p', attrs={"class": "txt-info"}):
                    rating1 = rating1 + 1
                date11 = datetime.datetime.now().strftime("%m/%d/%Y")
                # Each response is still handled on its own here, so the two
                # numbers of a pair never meet in one callback.
z*2
2
[The following was reposted from the Connecticut board]
From: zeayin82 (zea), Board: Connecticut
Subject: [Seeking] Postpartum nanny (月嫂) - Middletown, CT
Posted: BBS 未名空间站 (Thu Aug 19 09:33:22 2010, US Eastern)
Daytime-only or full-day both work; starting mid-December.
Pay and scope of work negotiable.
Whether or not you speak Chinese does not matter.