Runtime environment
Before you start, you need to install:
• PHP 5.3.7+
• MySQL
• Python 2.7
• xunsearch (the search engine)
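You can check what is already installed with:
php -v
python -V
mysql --version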
Getting the source
Via SSH:
git clone git@github.com:k1995/BaiduyunSpider.git
Via HTTPS:
git clone https://github.com/k1995/BaiduyunSpider
Or download it manually:
https://github.com/k1995/BaiduyunSpider/archive/master.zip
After downloading, the project directory structure looks roughly like this:
--- indexer/              # indexing scripts
--- spider/               # crawler
--- sql/
--- web/                  # website
    --- application/
        --- config/               # configuration
            --- config.php
            --- database.php      # database settings
            ...
        ...
    --- static/           # static assets: css|js|font
    --- system/
    --- index.php
    ...
Deployment
Create the database
Create a database named pan with utf-8 encoding, then import the SQL under sql/ to create the tables.
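For example, from the shell (this assumes a root MySQL account; substitute the actual dump file shipped in the sql/ directory):
mysql -u root -p -e "CREATE DATABASE pan DEFAULT CHARACTER SET utf8"
mysql -u root -p pan < sql/<dump-file>.sql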
Deploy the website
Both nginx and Apache are supported.
Apache needs mod_rewrite enabled; a sample rewrite rule is shown after the nginx config below.
A sample nginx configuration:
location / {
    index index.php;
    try_files $uri $uri/ /index.php/$uri;
}
location ~ [^/]\.php(/|$) {
    fastcgi_pass 127.0.0.1:9000;
    fastcgi_index index.php;
    include fastcgi.conf;
    include pathinfo.conf;
}
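For Apache, a typical CodeIgniter rewrite configuration looks like the sketch below (an assumption, not the project's shipped config; it presumes the virtual host's document root points at web/):
<IfModule mod_rewrite.c>
    RewriteEngine On
    RewriteCond %{REQUEST_FILENAME} !-f
    RewriteCond %{REQUEST_FILENAME} !-d
    RewriteRule ^(.*)$ index.php/$1 [L]
</IfModule>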
Edit the configuration files
config.php: site title, description, and other site information
database.php: database username, password, and other connection settings
The site is built on the CodeIgniter framework; if you have trouble installing, deploying, or extending it, refer to the official CodeIgniter documentation.
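As a rough guide, the values you will normally change in database.php look like this (assuming the stock CodeIgniter config format; the exact keys in the repo may differ):
$db['default']['hostname'] = '127.0.0.1';
$db['default']['username'] = 'root';
$db['default']['password'] = 'your-password';
$db['default']['database'] = 'pan';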
Start the spider
Go to the spider/ directory and update the database settings in spider.py.
If this is your first deployment, run the command below first to seed the crawler:
python spider.py --seed-user
This fetches the profiles of popular Baidu Yun sharers and uses them as the starting point for the crawl.
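To confirm that seeding worked, you can look at the two tables the seeder writes to, share_users and spider_list (see seedUsers() in the source below). A minimal check, assuming the same connection settings as spider.py:

# -*- coding: utf-8 -*-
import MySQLdb as mdb

conn = mdb.connect('127.0.0.1', 'root', '123123', 'pan', charset='utf8')
cur = conn.cursor()
# seed users harvested from the hot-user list
cur.execute('SELECT COUNT(*) FROM share_users')
print 'seed users: %d' % cur.fetchone()[0]
# crawl tasks queued for the main spider loop
cur.execute('SELECT COUNT(*) FROM spider_list')
print 'pending crawl tasks: %d' % cur.fetchone()[0]
conn.close()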
Then run:
python spider.py
The spider is now up and running.
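If you want the crawler to keep running after you close the terminal, one option (an assumption about your environment, not something the project requires) is:
nohup python spider.py > spider.log 2>&1 &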
Install xunsearch
xunsearch is currently used as the search engine; it may be replaced with Elasticsearch later.
For installation, follow the guide below (you do not need to install the PHP SDK; it is already bundled into web/):
http://xunsearch.com/doc/php/guide/start.installation
Index the data
The crawler and the website are now in place, but searching does not work yet. The final step is building the index.
Go to the indexer/ directory and, in indexer.php, replace $prefix with the root path of your web directory:
require '$prefix/application/helpers/xs/lib/XS.php';
Also update the database username and password in the same file.
Then run:
./indexer.php
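Because the spider keeps collecting new shares, you will probably want to rebuild the index periodically. A minimal sketch using cron (the install path is hypothetical; adjust it to your deployment):
0 3 * * * cd /path/to/BaiduyunSpider/indexer && ./indexer.php
For reference, the full spider.py source is reproduced below.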
# -*- coding: utf-8 -*-
import urllib2,re,argparse,json,time
import MySQLdb as mdb
import metautils,traceback,Queue,socket
import random
"""
/*
*--------------------------------------------
*
*
*
* Github 仓库: https://github.com/k1995/BaiduyunSpider
*
* 演示:http://www.11bt.net/ *
*
* ----------------------------------------*/
"""
DB_HOST='127.0.0.1'
DB_PORT='3306'
DB_USER='root'
# MySQL password
DB_PASS='123123'
# database name
DB_NAME='pan'
SPIDER_INTERVAL=1 # seconds to wait between queue items
ERR_NO=0 # no error
ERR_REFUSE=1 # request refused because the crawler is too fast
ERR_EX=2 # unknown error
# HTTP proxies to rotate through; replace x.x.x.x with real proxy addresses
proxy_list = [
{'http':"x.x.x.x:8080"},
{'http':"x.x.x.x:8081"},
{'http':"x.x.x.x:8082"},
{'http':"x.x.x.x:8083"},
{'http':"x.x.x.x:8084"},
{'http':"x.x.x.x:8085"},
{'http':"x.x.x.x:8086"},
{'http':"x.x.x.x:8087"},
{'http':"x.x.x.x:8088"},
{'http':"x.x.x.x:8089"}
]
# Fetch a URL through a random proxy with a random User-Agent, retrying up to `reget` times
def getHtml(url,ref=None,reget=5):
try:
uas = [
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
"Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
"Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
]
proxy_ip =random.choice(proxy_list)
ua=random.choice(uas)
print proxy_ip
print ua
proxy_support = urllib2.ProxyHandler(proxy_ip)
opener = urllib2.build_opener(proxy_support,urllib2.HTTPHandler)
urllib2.install_opener(opener)
request = urllib2.Request(url)
time.sleep(5)
request.add_header('User-Agent', ua)
if ref:
request.add_header('Referer',ref)
page = urllib2.urlopen(request,timeout=30)
html = page.read()
except:
if reget>=1:
            # getHtml failed; retry (up to 5 attempts in total)
print 'getHtml error,reget...%d'%(6-reget)
time.sleep(20)
return getHtml(url,ref,reget-1)
else:
print 'request url:'+url
print 'failed to fetch html'
exit()
else:
return html
class Db(object):
def __init__(self):
self.dbconn=None
self.dbcurr=None
def check_conn(self):
try:
self.dbconn.ping()
except:
return False
else:
return True
def conn(self):
self.dbconn=mdb.connect(DB_HOST, DB_USER, DB_PASS,DB_NAME, charset='utf8')
self.dbconn.autocommit(False)
self.dbcurr = self.dbconn.cursor()
def fetchone(self):
return self.dbcurr.fetchone()
def fetchall(self):
return self.dbcurr.fetchall()
    def execute(self, sql, args=None, flag=False):
        if not self.dbconn:
            # first use: open the database connection lazily
self.conn()
try:
if args:
rs=self.dbcurr.execute(sql,args)
else:
rs=self.dbcurr.execute(sql)
return rs
except Exception, e:
if self.check_conn():
print 'execute error'
traceback.print_exc()
else:
print 'reconnect mysql'
self.conn()
if args:
rs=self.dbcurr.execute(sql,args)
else:
rs=self.dbcurr.execute(sql)
return rs
def commit(self):
self.dbconn.commit()
def rollback(self):
self.dbconn.rollback()
    def close(self):
        self.dbcurr.close()
        self.dbconn.close()
def last_row_id(self):
return self.dbcurr.lastrowid
class BaiduPanSpider(object):
def __init__(self):
self.db=Db()
self.files=[]
self.got_files_count=0
self.got_follow_count=0
self.while_count=0
self.spider_queue=Queue.Queue(maxsize=20)
self.status='stop'
self.errno=ERR_NO
self.file_type_t={'video':0,'image':1,'document':2,'music':3,'package':4,'software':5,'torrent':6,'other':-1}
def getShareUser(self,uk):
url='http://pan.baidu.com/share/count?uk=%d&channel=chunlei&clienttype=0&web=1'%uk
follows_json=json.loads(getHtml(url,uk))
if follows_json['errno']!=0:
if follows_json['errno']==-55:
self.errno=ERR_REFUSE
else:
self.errno=ERR_EX
return False
return {
'pubshare_cnt':follows_json['pubshare_cnt'],
'fans':follows_json['fans'],
'follow':follows_json['follow'],
            'album':follows_json['album']
}
def getHotUser(self):
url='http://pan.baidu.com/pcloud/friend/gethotuserlist?type=1&from=feed&start=0&limit=24&channel=chunlei&clienttype=0&web=1'
follows_json=json.loads(getHtml(url))
if follows_json['errno']!=0:
print u'failed to fetch hot users'
return False
returns=[]
count=0
for item in follows_json['hotuser_list']:
count=count+1
hot_uname=item['hot_uname'].encode('utf-8')
hot_uk=item['hot_uk']
avatar_url=item['avatar_url'].encode('utf-8')
intro=item['intro'].encode('utf-8')
follow_count=item['follow_count']
fans_count=item['fans_count']
pubshare_count=item['pubshare_count']
album_count=item['album_count']
returns.append({'hot_uname':hot_uname,'hot_uk':hot_uk,'avatar_url':avatar_url,'intro':intro,'follow_count':follow_count,'fans_count':fans_count,'pubshare_count':pubshare_count,'album_count':album_count})
if count==0:
print "got no hot users"
return False
else:
print "success to fetched hot users: %d"%count
return returns
def getFans(self,uk,start=0,limit=24):
        # query_uk: user ID
        # limit: maximum number of items per page
        # start: start offset of the current page
follows_url='http://pan.baidu.com/pcloud/friend/getfanslist?query_uk=%d&limit=%d&start=%d'%(uk,limit,start)
follows_json=json.loads(getHtml(follows_url,uk))
if follows_json['errno']!=0:
            print u'failed to fetch fans'
return False
total_count=follows_json['total_count']
returns=[]
count=0
for item in follows_json['fans_list']:
count=count+1
fans_uname=item['fans_uname'].encode('utf-8')
fans_uk=item['fans_uk']
avatar_url=item['avatar_url'].encode('utf-8')
intro=item['intro'].encode('utf-8')
follow_count=item['follow_count']
fans_count=item['fans_count']
pubshare_count=item['pubshare_count']
album_count=item['album_count']
returns.append({'fans_uname':fans_uname,'fans_uk':fans_uk,'avatar_url':avatar_url,'intro':intro,'follow_count':follow_count,'fans_count':fans_count,'pubshare_count':pubshare_count,'album_count':album_count})
return (total_count,count,returns)
def getFollows(self,uk,start=0,limit=24):
follows_url='http://pan.baidu.com/pcloud/friend/getfollowlist?query_uk=%d&limit=%d&start=%d&bdstoken=d82467db8b1f5741daf1d965d1509181&channel=chunlei&clienttype=0&web=1'%(uk,limit,start)
ref='http://pan.baidu.com/pcloud/friendpage?type=follow&uk=%d&self=1'%uk
follows_json=json.loads(getHtml(follows_url,ref))
if follows_json['errno']!=0:
print 'getFollows errno:%d'%follows_json['errno']
print 'request_url:'+follows_url
if follows_json['errno']==-55:
self.errno=ERR_REFUSE
else:
self.errno=ERR_EX
return False
total_count=follows_json['total_count']
returns=[]
count=0
if(total_count>0):
for item in follows_json['follow_list']:
count=count+1
returns.append({
'follow_uname':item['follow_uname'].encode('utf-8'),
'follow_uk':item['follow_uk'],
'avatar_url':item['avatar_url'].encode('utf-8'),
'intro':item['intro'].encode('utf-8'),
'follow_count':item['follow_count'],
'fans_count':item['fans_count'],
'pubshare_count':item['pubshare_count'],
'album_count':item['album_count']
})
return (total_count,count,returns)
def getShareLists(self,uk,start=0,limit=60):
sharelists_url='http://pan.baidu.com/pcloud/feed/getsharelist?category=0&auth_type=1&request_location=share_home&start=%d&limit=%d&query_uk=%d&channel=chunlei&clienttype=0&web=1'%(start,limit,uk)
ref='http://pan.baidu.com/share/home?uk=%d&view=share'%uk
listhtm=getHtml(sharelists_url,ref)
print(sharelists_url)
sharelists_json=json.loads(listhtm)
if(sharelists_json['errno']!=0):
print 'getShareLists errno:%d'%sharelists_json['errno']
print 'request_url:'+sharelists_url
if sharelists_json['errno']==-55:
self.errno=ERR_REFUSE
else:
self.errno=ERR_EX
return False
total_count=sharelists_json['total_count']
returns=[]
count=0
if total_count>0:
for item in sharelists_json['records']:
count=count+1
feed_type=item['feed_type']
isdir=0
size=0
md5=''
album_id=''
                shorturl=''
                username=''
if feed_type=='share':
if item['filecount']==1:
filelist=item['filelist']
isdir=filelist[0]['isdir']
size=filelist[0]['size']
md5=filelist[0]['md5']
else:
isdir=1
elif feed_type=='album':
album_id=item['album_id']
isdir=2
if item.has_key('shorturl'):
shorturl=item['shorturl']
if item.has_key('username'):
username=item['username'].encode('utf-8')
if feed_type=='share' or feed_type=='album':
returns.append({
'title':item['title'].encode('utf-8'),
'username':username,
'shorturl':shorturl,
'shareid':item['source_id'],
                        'feed_time':item['feed_time']//1000, # share timestamp (ms -> s)
'dCnt':item['dCnt'],
'isdir':isdir,
'size':size,
'md5':md5,
'uk':uk,
'feed_type':feed_type
})
return (total_count,count,returns)
def getAlbum(self,uk,start=0,limit=60):
url='http://pan.baidu.com/pcloud/album/getlist?start=%d&limit=%d&query_uk=%d&channel=chunlei&clienttype=0&web=1&bdstoken=d82467db8b1f5741daf1d965d1509181'%(start,limit,uk)
album_json=json.loads(getHtml(url,uk))
total_count=album_json['count']
returns=[]
count=0
for item in album_json['album_list']:
count=count+1
title=item['title'].encode('utf-8')
album_id=item['album_id']
create_time=item['create_time']
update_time=item['update_time']
filecount=item['filecount']
desc=item['desc']
returns.append({'title':title,'album_id':album_id,'create_time':create_time,'desc':desc,'update_time':update_time,'filecount':filecount,'uk':uk})
        if count==0:
            print "got no albums"
            return False
        else:
            print "successfully fetched %d albums"%count
if (start+count)<total_count:
start=start+limit
returns=returns+self.getAlbum(uk,start)
return returns
def seedUsers(self):
hot_usrs=self.getHotUser()
if not hot_usrs:
return
try:
for user in hot_usrs:
time_stamp=int(time.time())
if user['pubshare_count']>0:
self.db.execute("INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,\
fens_count,pubshare_count,last_visited,create_time,weight) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(
user['hot_uk'],user['hot_uname'],user['avatar_url'],user['intro'],user['follow_count'],
user['album_count'],user['fans_count'],user['pubshare_count'],time_stamp,time_stamp,5
)
)
uid=self.db.last_row_id()
self.db.execute("INSERT INTO spider_list (uk,uid) VALUES(%s,%s)",(user['hot_uk'],uid))
except:
traceback.print_exc()
self.db.rollback()
else:
self.db.commit()
def startSpider(self):
if self.spider_queue.empty():
fetched_users=self.db.execute('SELECT * from spider_list ORDER BY weight DESC limit 0,20')
if fetched_users<=0:
print 'nothing to spider,spider_list is empty'
return False
            self.status='start'
self.errno=ERR_NO
fetchall=self.db.fetchall()
            # put the sharers fetched from the database into the crawl queue
for item in fetchall:
self.spider_queue.put({
'sid':item[0],
'uk':item[1],
'file_fetched':item[2],
'follow_fetched':item[3],
'follow_done':item[4],
'file_done':item[5],
'weight':item[6],
'uid':item[7]
})
self.got_follow_count=0
self.got_files_count=0
self.while_count=0
while not self.spider_queue.empty():
self.while_count+=1
share_user=self.spider_queue.get()
            # crawl this sharer's file list
if not share_user['file_done']:
print '%d now spidering file ,%d file fetched'%(share_user['uk'],share_user['file_fetched'])
rs=self.getShareLists(share_user['uk'],share_user['file_fetched'])
#print(rs)
if not rs:
                    print 'uk:%d failed to fetch files, will try again later...'%share_user['uk']
return True
total_count,fetched_count,file_list=rs
total_fetched=share_user['file_fetched']+fetched_count
print 'fetched_file_count:%d'%fetched_count
if total_fetched>=total_count or total_count==0:
                    share_user['file_done']=1 # all files of this sharer have been fetched
if total_count==0:
self.db.execute("UPDATE spider_list set file_done=%s WHERE sid=%s",(1,share_user['sid']))
self.db.commit()
else:
try:
files_count=0
for file in file_list:
files_count+=1
ext=''
file_type=''
file_type_i=-1
if file['isdir']==0 and file['feed_type']=='share':
ext = metautils.get_extension(file['title']).lower()
file_type = metautils.get_category(ext)
file_type_i=self.file_type_t[file_type]
time_stamp=int(time.time())
self.db.execute("INSERT INTO share_file (title,uk,user_name,shareid,shorturl,isdir,size,md5,ext,feed_time,create_time,file_type,uid,feed_type) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(file['title'],file['uk'],file['username'],file['shareid'], file['shorturl'],file['isdir'],file['size'],file['md5'],ext,file['feed_time'],time_stamp,file_type_i,share_user['uid'],file['feed_type'])
)
except:
share_user['file_done']=0
self.db.rollback()
traceback.print_exc()
return False
else:
self.db.execute("UPDATE spider_list set file_fetched=%s,file_done=%s WHERE sid=%s",(total_fetched,share_user['file_done'],share_user['sid']))
self.db.execute("UPDATE share_users set fetched=%s WHERE uid=%s",(total_fetched,share_user['uid']))
share_user['file_fetched']=total_fetched
self.got_files_count+=files_count
self.db.commit()
            # after the file list is done, crawl the follow list
if share_user['follow_done']==0 and share_user['file_done']==1:
print '%d now spidering follow ,%d follow fetched'%(share_user['uk'],share_user['follow_fetched'])
rs=self.getFollows(share_user['uk'],share_user['follow_fetched'])
if not rs:
                    print 'failed to fetch follows, will try again later...'
return
total_count,fetched_count,follow_list=rs
total_fetched=share_user['follow_fetched']+fetched_count
print 'fetched_follow_count:%d'%fetched_count
if total_fetched>=total_count or total_count==0:
share_user['follow_done']=1
if total_count==0:
self.db.execute("DELETE FROM spider_list WHERE sid=%s",(share_user['sid'],))
self.db.commit()
else:
try:
follow_count=0
for follow in follow_list:
follow_count+=1
                            # skip users that are already in the share_users table
if self.db.execute('SELECT * FROM share_users WHERE uk=%s',(follow['follow_uk'],))>0:
print 'uk:%d has already in share_user table'%follow['follow_uk']
continue
time_stamp=int(time.time())
self.db.execute("INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,\
fens_count,pubshare_count,last_visited,create_time,weight) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(
follow['follow_uk'],follow['follow_uname'],follow['avatar_url'],follow['intro'],follow['follow_count'],
follow['album_count'],follow['fans_count'],follow['pubshare_count'],time_stamp,time_stamp,5
)
)
                            # add the newly found sharer to the crawl list
self.db.execute("INSERT INTO spider_list (uk,uid) VALUES(%s,%s)",(follow['follow_uk'],self.db.last_row_id()))
except:
share_user['follow_done']=0
self.db.rollback()
traceback.print_exc()
return False
else:
if share_user['follow_done']==1:
                            # follows done: this sharer is finished, remove it from the pending list
print 'delete follow fetched sid:%d from spider_list'%share_user['sid']
self.db.execute("DELETE FROM spider_list WHERE sid=%s",(share_user['sid'],))
else:
self.db.execute("UPDATE spider_list set follow_fetched=%s,follow_done=%s WHERE sid=%s",(total_fetched,share_user['follow_done'],share_user['sid']))
share_user['follow_fetched']=total_fetched
self.got_follow_count+=follow_count
self.db.commit()
            # if the follow list is not finished, this sharer is not done yet; requeue it and keep crawling
if share_user['follow_done']==0:
self.spider_queue.put(share_user)
else:
print '%d has done'%share_user['uk']
del share_user
time.sleep(SPIDER_INTERVAL)
print '-----------------Done------------------'
print 'while_count:%d'%self.while_count
print 'got_follow_count:%d'%self.got_follow_count
print 'got_files_count:%d'%self.got_files_count
return True
def stop(self):
pass
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--seed-user", help="get seed user", action="store_true")
args = parser.parse_args()
spider=BaiduPanSpider()
    # seed mode
if args.seed_user:
spider.seedUsers()
else:
while(1):
print 'start spider...'
result=spider.startSpider()
if not result:
                print 'The spider was refused; will retry automatically in 5 minutes...'
time.sleep(60*5)
else:
                print 'one work queue is done'
time.sleep(1)