Python 抓取微信文章图片

WechatIMG194.jpeg

Python 抓取微信文章图片

  • 创建 crawl_image.py,代码如下。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#  -*- coding: utf-8 -*-
# !python3
import re
import os
import ssl
import requests
from bs4 import BeautifulSoup

def getHTMLText(url):
    """Fetch a page and return its body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or an empty string when the request
        fails (connection error, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: a failed request yields "" instead of raising.
        # Only request failures are swallowed, so KeyboardInterrupt and
        # programming errors still surface.
        return ""
    # Decode with the charset detected from the body rather than the one
    # declared in the response headers.
    r.encoding = r.apparent_encoding
    return r.text

def getimgURL(html):
    """Parse a page and collect the image URLs from its <img> tags.

    Args:
        html: Page markup as a string.

    Returns:
        A list with one entry per matching <img> tag, minus the last one.
        Each entry is the list produced by ``findall`` for that tag, so the
        URL itself is at index 0 of the entry. Returns an empty list when
        no tag matches.
    """
    soup = BeautifulSoup(html, "html.parser")
    # The pattern is applied to the serialized tag. It is not anchored on the
    # attribute name, so it also matches attributes ending in "src" (such as
    # data-src), and it requires the closing quote to be followed by a space.
    src_pattern = re.compile(r'.*src="(.*?)?" .*')
    adlist = []
    for tag in soup.find_all("img"):
        ad = src_pattern.findall(str(tag))
        if ad:
            adlist.append(ad)
    # The last match is discarded, as in the original script.
    # NOTE(review): presumably a trailing non-article image - confirm.
    # Guarded so that a page with no images no longer raises IndexError.
    if adlist:
        del adlist[-1]
    return adlist

#
def download(adlist):
    """Download every collected image into the ``pic/`` folder.

    Files are named by their position in ``adlist`` (``pic/0.png``,
    ``pic/1.png``, ...). A file that already exists is left alone.

    Args:
        adlist: List of entries whose element 0 is an image URL.
    """
    root = "pic/"
    if not adlist:
        # Nothing to fetch; do not create an empty folder.
        return
    # Create the target folder once, before the loop.
    os.makedirs(root, exist_ok=True)
    for i, ad in enumerate(adlist):
        path = root + str(i) + "." + 'png'
        if os.path.exists(path):
            continue
        # Same timeout as the page fetch so a stalled server cannot hang
        # the script.
        r = requests.get(ad[0], timeout=30)
        if not r.ok:
            # Do not save an HTTP error body as an image; leaving the file
            # absent lets a later run retry this URL.
            continue
        with open(path, 'wb') as f:
            f.write(r.content)

# Replaces the default HTTPS context factory of the standard library with an
# unverified one, which turns off certificate checking for code that uses it.
# NOTE(review): the fetches below go through requests - confirm whether this
# override is still needed, and remove it if not.
ssl._create_default_https_context = ssl._create_unverified_context
url = 'https://mp.weixin.qq.com/s/5A8ooI15XtS6FR3yo4AHpg'
html = getHTMLText(url)
# Named img_urls rather than list so the builtin list type is not shadowed.
img_urls = getimgURL(html)
download(img_urls)
  • 执行
1
$ python3 crawl_image.py
  • 查看 pic 目录
1
2
$ ls pic
0.png 1.png 2.png 3.png 4.png 5.png 6.png 7.png
-------------本文结束感谢您的阅读-------------
0%