forked from zhangyingcc/spider_python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspider.py
More file actions
123 lines (109 loc) · 3.93 KB
/
spider.py
File metadata and controls
123 lines (109 loc) · 3.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/local/python275/bin/python2.7
# -*- coding: utf-8 -*-
'''
#=============================================================================
# FileName: spider.py
# Desc:
# Author: linjay
# Email: linjayzhang326@gmail.com
# HomePage: https://github.com/Linjay
# Version: 0.0.2
# LastChange: 2013-8-13 17:12:36
# History:
#=============================================================================
'''
import re
import requests
import redis
import logging
from bs4 import BeautifulSoup
class spider():
    """Job-board crawler.

    Scrapes configured BBS forum boards (BYR / newsmth) for article links
    whose titles contain any of the configured keywords, and accumulates
    the matching links in the Redis set ``'urls'``.
    """

    def __init__(self, log_addr, log_format, log_lvl, ip, port, fre, keys):
        # log_addr   : path of the log file written by init_log()
        # log_format : logging.Formatter format string
        # log_lvl    : logging level constant (e.g. logging.INFO)
        # ip, port   : Redis server address
        # fre        : run-count threshold; capture() flushes Redis once
        #              the stored 'times' counter reaches this value
        # keys       : iterable of keyword substrings matched against titles
        self.LOG_ADDRESS = log_addr
        self.LOG_FORMAT = log_format
        self.LOG_LEVEL = log_lvl
        self.REDIS_IP = ip
        self.REDIS_PORT = port
        self.REDIS_FREQUENCE = fre
        self.SPIDER_KEYS = keys

    def init_log(self):
        """Attach a file handler to the root logger and return it."""
        logger = logging.getLogger()
        handler = logging.FileHandler(self.LOG_ADDRESS)
        handler.setFormatter(logging.Formatter(self.LOG_FORMAT))
        logger.addHandler(handler)
        logger.setLevel(self.LOG_LEVEL)
        return logger

    def init_params(self):
        """Return the crawl targets, one dict per board.

        Each dict holds the link-prefix host, the board URL, the AJAX
        request headers the boards expect, and the regex matching article
        hrefs on that board.
        """
        return (
            {
                'host' : 'http://bbs.byr.cn',
                'url' : 'http://bbs.byr.cn/board/JobInfo',
                'headers' : {
                    "X-Requested-With" : "XMLHttpRequest",
                },
                # raw string: keeps the regex escape \d unambiguous
                'href' : r"^/article/JobInfo/\d+$",
            },
            {
                'host' : 'http://bbs.byr.cn',
                'url' : 'http://bbs.byr.cn/board/Job',
                'headers' : {
                    "X-Requested-With" : "XMLHttpRequest",
                },
                'href' : r"^/article/Job/\d+$",
            },
            {
                'host' : 'http://bbs.byr.cn',
                'url' : 'http://bbs.byr.cn/board/ParttimeJob',
                'headers' : {
                    "X-Requested-With" : "XMLHttpRequest",
                },
                'href' : r"^/article/ParttimeJob/\d+$",
            },
            {
                'host' : 'http://www.newsmth.net',
                'url' : 'http://www.newsmth.net/nForum/board/Career_Campus',
                'headers' : {
                    "X-Requested-With" : "XMLHttpRequest",
                },
                'href' : r"^/nForum/article/Career_Campus/\d+$",
            },
        )

    def spider_cap(self, rs, host, url, headers, href):
        """Fetch one board page and push keyword-matching links into Redis.

        rs      -- connected redis client
        host    -- prefix prepended to relative article hrefs
        url     -- board page to fetch
        headers -- HTTP headers for the request
        href    -- regex that article hrefs must match
        """
        try:
            r = requests.get(url, headers=headers)
            # an HTTP error page carries no article list; treat it like a
            # connection failure instead of parsing garbage
            r.raise_for_status()
        except requests.RequestException as ex:
            logging.getLogger().error("fetch failed for %s: %s", url, ex)
            return
        # explicit parser: avoids bs4's "no parser specified" warning and
        # environment-dependent parse results
        frs_soup = BeautifulSoup(r.text, 'html.parser')
        frs_attrs = {
            'href' : re.compile(href),
            'title' : None,
            'target' : None,
        }
        for line in frs_soup.findAll('a', frs_attrs):
            # skip pinned ("top") threads; bs4 returns the class attribute
            # as a list while bs3 returned a string, so accept both shapes
            row_cls = line.parent.parent.get('class')
            if row_cls == 'top' or (isinstance(row_cls, list) and 'top' in row_cls):
                continue
            line['href'] = host + line['href']
            title = line.string
            # line.string is None when the <a> contains nested markup;
            # guard it so the substring test cannot raise TypeError
            if title and any(key in title for key in self.SPIDER_KEYS):
                rs.sadd('urls', str(line))

    def capture(self):
        """Run one crawl cycle over every configured board.

        Increments the Redis 'times' run counter and flushes the database
        once it reaches REDIS_FREQUENCE, so stale postings age out.
        """
        logger = self.init_log()
        logger.info('spider start!')
        rs = redis.Redis(host=self.REDIS_IP, port=self.REDIS_PORT)
        rs.incr('times')
        if int(rs.get('times')) >= self.REDIS_FREQUENCE:
            rs.flushall()
        for param in self.init_params():
            self.spider_cap(rs, param['host'], param['url'],
                            param['headers'], param['href'])
        logger.info("spider finish!\n")