问题
I'm having trouble finding something with bs4.
I'm trying to automatically find some URLs in an Instagram HTML page and (bearing in mind that I'm a Python noob) I can't find a way to automatically search the HTML source code for the URLs that appear, as in the example below, after "display_url": "http...".
I want my script to find every URL that appears right after "display_url" and download it. The URLs have to be extracted as many times as they appear in the source code.
With bs4 I tried the following:
# Fetch the page source; the context manager closes the HTTP response as soon
# as the body has been read instead of leaving the connection open.
with urllib.request.urlopen(fileURL) as f:
    htmlSource = f.read()
soup = bs(htmlSource, 'html.parser')

# Collect every <meta property="og:image"> tag in the page.
metaTag = soup.find_all('meta', {'property': 'og:image'})
if metaTag:
    # Only the first og:image entry is downloaded.
    imgURL = metaTag[0]['content']
    urllib.request.urlretrieve(imgURL, 'fileName.jpg')
else:
    # Avoid an IndexError on metaTag[0] when the page has no such tag.
    print('No og:image meta tag found in the page')
But I can't make soup.find_all(...) work for this search.
Is there a way for me to find this part of the page with bs4?
Thanks a lot for your help.
Here is an example of my little (Python) code as it is now: https://repl.it/@ClementJpn287/bs
<!––cropped...............-->
<body class="">
<span id="react-root"><svg width="50" height="50" viewBox="0 0 50 50" style="position:absolute;top:50%;left:50%;margin:-25px 0 0 -25px;fill:#c7c7c7">
<path
d="
<!––deleted part for privacy -->
" />
</svg></span>
<script type="text/javascript">
window._sharedData = {
"config": {
"csrf_token": "",
"viewer": {
<!––deleted part for privacy -->
"viewerId": ""
},
"supports_es6": true,
"country_code": "FR",
"language_code": "fr",
"locale": "fr_FR",
"entry_data": {
"PostPage": [{
"graphql": {
"shortcode_media": {
"__typename": "GraphSidecar",
<!––deleted part for privacy -->
"dimensions": {
"height": 1080,
"width": 1080
},
"gating_info": null,
"media_preview": null,
<--There's the important part that have to be extracted as many times it appear in the source code-->
"display_url": "https://scontent-cdt1-1.cdninstagram.com/vp/",
"display_resources": [{
"src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
"config_width": 640,
"config_height": 640
}, {
"src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
"config_width": 750,
"config_height": 750
}, {
"src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
"config_width": 1080,
"config_height": 1080
}],
"is_video": false,
<!––cropped...............-->
my newest code
回答1:
You could find the appropriate script tag and use a regex to pull out the info. I have assumed that the first script tag containing `window._sharedData =` is the appropriate one. You can adjust this as required.
from bs4 import BeautifulSoup as bs
import re
# Cropped sample of the page markup, used as the input parsed below. The inline
# <script> assigns window._sharedData, and its JSON carries the "display_url"
# entry that has to be extracted.
html = '''
<html>
<head></head>
<body class="">
<span id="react-root">
<svg width="50" height="50" viewbox="0 0 50 50" style="position:absolute;top:50%;left:50%;margin:-25px 0 0 -25px;fill:#c7c7c7">
<path d="
<!––deleted part for privacy -->
" />
</svg></span>
<script type="text/javascript">
window._sharedData = {
"config": {
"csrf_token": "",
"viewer": {
<!––deleted part for privacy -->
"viewerId": ""
},
"supports_es6": true,
"country_code": "FR",
"language_code": "fr",
"locale": "fr_FR",
"entry_data": {
"PostPage": [{
"graphql": {
"shortcode_media": {
"__typename": "GraphSidecar",
<!––deleted part for privacy -->
"dimensions": {
"height": 1080,
"width": 1080
},
"gating_info": null,
"media_preview": null,
<--There's the important part that have to be extracted as many times it appear in the source code-->
"display_url": "https://scontent-cdt1-1.cdninstagram.com/vp/",
"display_resources": [{
"src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
"config_width": 640,
"config_height": 640
}, {
"src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
"config_width": 750,
"config_height": 750
}, {
"src": "https://scontent-cdt1-1.cdninstagram.com/vp/",
"config_width": 1080,
"config_height": 1080
}],
"is_video": false,</script>
</body>
</html>
'''
soup = bs(html, 'lxml')

# Pick the first text/javascript <script> that assigns window._sharedData.
# `data` starts out empty so the regex below still runs (and finds nothing)
# when no such script exists, instead of failing with a NameError.
data = ''
for script in soup.select('script[type="text/javascript"]'):
    # No leading space in the marker: the assignment may start the line.
    if 'window._sharedData =' in script.text:
        data = script.text
        break

# Capture only what sits between the quotes of each "display_url" value.
# `[^"]*` stops at the closing quote, so several entries on one line
# (minified JSON) are returned separately rather than as one greedy match.
r = re.compile(r'"display_url":\s*"([^"]*)"')
print(r.findall(data))
Thanks to @t.h.adam it may be possible to shorten the above to:
soup = bs(html, 'lxml')

# Capture only the quoted value; `[^"]*` keeps each match to a single URL.
r = re.compile(r'"display_url":\s*"([^"]*)"')

# `string=` is the current spelling of the legacy `text=` filter: it selects
# the first <script> whose content matches the pattern.
script = soup.find('script', string=r)

# find() returns None when nothing matches; fall back to an empty string so
# the result is an empty list rather than an AttributeError.
data = script.string if script is not None else ''
print(r.findall(data))
回答2:
The program has progressed and now looks something like this:
# Download the page (`html` holds its address) and parse it.
response = urllib.request.urlopen(html)
soup = BeautifulSoup(response, "html.parser")
print(soup.title.text)

# Dump the text of the fourth text/javascript <script> element to a scratch file.
script_tags = soup.select('script[type="text/javascript"]')
script_text = script_tags[3].get_text()
with open("tet.txt", 'w') as scratch:
    scratch.write(script_text)

# Read the scratch file back in one piece.
with open('tet.txt', 'r') as scratch:
    data = scratch.read()

# Print the span from the first "display_url":" marker up to the first
# ","display_resources": marker, keeping that marker's leading quote.
start = data.index('"display_url":"')
stop = data.index('","display_resources":') + 1
print(data[start:stop])
But now a new question has come up:
- How can I make the URL-finding part of the program (lines 10 and 11) repeat for as long as the span from '"display_url":"' to '","display_resources":' still appears in the tet.txt file?
- A while loop could be used, but how do I make it repeat the process?
回答3:
Problem Solved
Here's the code to download multiple images from an Instagram URL with Pythonista 3 on iOS:
from sys import argv
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
import photos
import clipboard
import json  # decodes JSON string escapes (e.g. \u0026) found inside the URLs

# Replace with the address of the Instagram post to download.
thepage = "your url"

# Matches every `"display_url": "<value>"` pair and captures the raw JSON
# string body; `(?:[^"\\]|\\.)*` stops at the first unescaped closing quote.
DISPLAY_URL_RE = re.compile(r'"display_url"\s*:\s*"((?:[^"\\]|\\.)*)"')


def extract_display_urls(text):
    """Return the distinct ``display_url`` values found in *text*.

    Args:
        text: Script/JSON source to scan, as a string.

    Returns:
        A list of URL strings in order of first appearance. Repeated URLs are
        kept once, so an address that appears several times in the page is
        only downloaded a single time.
    """
    urls = []
    for raw in DISPLAY_URL_RE.findall(text):
        try:
            # Re-quote the captured body and let the JSON parser undo escapes.
            url = json.loads('"' + raw + '"')
        except ValueError:
            # Not a valid JSON string body: keep the text exactly as matched.
            url = raw
        urls.append(url)
    # dict.fromkeys() de-duplicates while preserving insertion order.
    return list(dict.fromkeys(urls))


def main():
    """Fetch ``thepage``, extract its display URLs and save each image."""
    # The context manager closes the HTTP response once the page is parsed.
    with urllib.request.urlopen(thepage) as response:
        soup = BeautifulSoup(response, "html.parser")
    if soup.title is not None:
        print(soup.title.text)

    # Scan every <script> instead of relying on a fixed position in the page.
    script_text = "\n".join(
        script.string or "" for script in soup.find_all("script")
    )
    urls = extract_display_urls(script_text)

    for images, url in enumerate(urls, start=1):
        urllib.request.urlretrieve(url, 'photo.jpg')
        # NOTE(review): presumably adds the file to the iOS photo library
        # (Pythonista `photos` module) - confirm against its documentation.
        photos.create_image_asset('photo.jpg')
        print('Image ' + str(images) + ' downloaded')
    print("OK")


if __name__ == "__main__":
    main()
It's a bit tedious, but it works, and quickly too. Thanks for your help.
来源:https://stackoverflow.com/questions/55191164/beautifulsoup-find-within-an-instagram-html-page