IT虾米网

简单的电子邮件爬虫Python代码

shasha 2018年06月24日 编程语言 1097 0
import requests 
import re 
try: 
    from urllib.parse import urljoin 
except ImportError: 
    from urlparse import urljoin 
 
# Regular expressions used by the crawler.
#
# email_re: one capture group, so findall() returns plain address strings.
#   - local part: word characters plus ".", "+" and "-" (commas are list
#     separators, not part of an address, so they are deliberately excluded);
#   - domain: dot-separated labels that may contain "-", ending in a
#     word-character TLD, so trailing sentence punctuation is not swallowed.
# link_re: captures the value of every double-quoted href attribute.
email_re = re.compile(r'([\w.+-]+@[\w-]+(?:\.[\w-]+)*\.\w+)')
link_re = re.compile(r'href="(.*?)"')
 
 
def crawl(url, timeout=10):
    """Collect e-mail addresses from ``url`` and from the pages it links to.

    The start page is fetched, scanned for addresses and searched for
    ``href="..."`` links. Each distinct http(s) link is then fetched once
    and scanned as well. The crawl is one level deep: links found on the
    linked pages are not followed.

    Args:
        url: Absolute URL of the page to start from.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A set of the address strings matched by ``email_re``. The set is
        empty when the start page does not answer with status 200.

    Raises:
        requests.RequestException: If the start page itself cannot be
            fetched. Failures on linked pages are skipped instead.
    """
    result = set()

    # Without a timeout a stalled server would block the crawl forever.
    req = requests.get(url, timeout=timeout)

    # Check if successful; return the same (empty) set type as on success
    if req.status_code != 200:
        return result

    # Addresses on the start page itself
    result.update(email_re.findall(req.text))

    # Find links
    links = link_re.findall(req.text)

    print("\nFound {} links".format(len(links)))

    # Pages already scanned, so repeated links are fetched only once
    visited = {url}

    # Search the linked pages for emails
    for link in links:

        # Get an absolute URL for the link; the fragment is dropped because
        # "page#section" is the same document as "page"
        link = urljoin(url, link).split('#', 1)[0]

        if link in visited:
            continue
        visited.add(link)

        # mailto:, javascript:, tel: and similar targets cannot be fetched
        if not link.lower().startswith(('http://', 'https://')):
            continue

        try:
            page = requests.get(link, timeout=timeout)
        except requests.RequestException:
            # One unreachable link must not abort the whole crawl
            continue

        # Find all emails on the linked page
        if page.status_code == 200:
            result.update(email_re.findall(page.text))

    return result
 
if __name__ == '__main__':
    # Demo run: crawl a fixed site and list each address on its own line,
    # framed by a heading before and a blank gap after.
    found = crawl('http://www.realpython.com')

    print("\nScrapped e-mail addresses:")
    for address in found:
        print(address)
    print("\n")

评论关闭
IT虾米网

微信公众号:IT虾米(左侧二维码扫一扫),欢迎添加!