新闻  |   论坛  |   博客  |   在线研讨会
Python 爬虫小记
tanry111 | 2018-09-11 10:19:25    阅读:254   发布文章

import urllib2

import re



def download(url):

    print "Downloading:",url

    try:

        html=urllib2.urlopen(url).read()

    except urllib2.URLError as e:

        print "Download error:",e.reason

        html=None

    return html

def crawl_sitemap(url):

    sitemap=download(url)

    links=re.findall('<a href="(.*?)" title',sitemap)

    txt=open("123.txt","w",)

    print links

    try:

        for link in links:

            html=download(link)

            page=re.findall('<p>(.*?)</p>',html)

    #        print page

            for i in page:

                txt.write(i)

                txt.write("\n")

    except Exception as e:

        print "Download error:",e

        html=None

    txt.close()

        

# Seed URL for the crawl (Douban Shanghai housing group).
url = "https://www.douban.com/group/shanghaizufang/"

# Guard the entry point so importing this module does not immediately
# start a network crawl (the original ran unconditionally at import time).
if __name__ == "__main__":
    crawl_sitemap(url)

      


参与讨论
登录后参与讨论
推荐文章
最近访客