xbmc/kodi Python scrape data using BeautifulSoup
Date : March 29 2020, 07:55 AM
Hope that helps. I want to edit a Kodi addon that uses re.compile to scrape data and convert it to BeautifulSoup4; the code looks like this:
from bs4 import BeautifulSoup
html = """<div id="content">
<span class="someclass">
<span class="sec">
<a class="frame" href="http://somlink.com/section/name-here" title="name here">
<img src="http://www.somlink.com/thumb/imgsection/thumbnail.jpg" >
</a>
</span>
<h3 class="title">
<a href="http://somlink.com/section/name-here">name here</a>
</h3>
<span class="details"><span class="length">Length: 99:99</span>
</span>
</div>
"""
soup = BeautifulSoup(html, "lxml")
sec = soup.find("span", {"class": "someclass"})
# get a tag with frame class
fr = sec.find("a", {"class": "frame"})
# pull img src and href from the a/frame
url, img = fr["href"], fr.find("img")["src"]
# get h3 with title class and extract the text from the anchor
name = sec.select("h3.title a")[0].text
# "size" is in the span with the details class
size = sec.select("span.details")[0].text.split(None,1)[-1]
print(url, img, name.strip(), size.strip())
('http://somlink.com/section/name-here', 'http://www.somlink.com/thumb/imgsection/thumbnail.jpg', u'name here', u'99:99')
def secs():
    soup = BeautifulSoup(html, "lxml")
    sections = soup.find_all("span", {"class": "someclass"})
    for sec in sections:
        fr = sec.find("a", {"class": "frame"})
        url, img = fr["href"], fr.find("img")["src"]
        name = sec.select("h3.title a")[0].text
        size = sec.select("span.details")[0].text.split(None, 1)[-1]
        # the thumbnail could also be pulled directly with sec.find("img")["src"]
        yield url, name, img, size
Scrape the article with Python 3.4 and BeautifulSoup, Requests
Date : March 29 2020, 07:55 AM
I hope this helps you. The desired data is not actually located inside the element with the status-list class; if you inspect the source, you will find an empty container instead:
<div class="status_bd">
    <div id="statusLists" class="allStatuses no-head"></div>
</div>
The statuses are instead embedded in a script tag as a SNB.data.statuses JavaScript object, so you can locate that script with a regular expression and parse the object with json:
import json
import re
import requests
from bs4 import BeautifulSoup
url = 'https://xueqiu.com/yaodewang'
headers = {
    'user-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'
}
r = requests.get(url, headers=headers).content
soup = BeautifulSoup(r, 'lxml')
pattern = re.compile(r"SNB\.data\.statuses = ({.*?});", re.MULTILINE | re.DOTALL)
script = soup.find("script", text=pattern)
data = json.loads(pattern.search(script.text).group(1))
for item in data["statuses"]:
    print(item["description"])
The best advice: Remember common courtesy and act toward others as you want them to act toward you.
Lighten up! It's the weekend. we're just having a little fun! Industrial Bank is expected to rise,next week...
...
点.点.点... 点到这个,学位、学历、成绩单翻译一下要50块、100块的...
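A side note, not from the original answer: newer BeautifulSoup 4 releases prefer the string= keyword over text= for this kind of match, so the lookup can also be written as:
script = soup.find("script", string=pattern)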
Data scrape from a website to a CSV file format using Python and BeautifulSoup
Tag: python, By: user169463
Date : March 29 2020, 07:55 AM
Hope that helps. This should get you started; I'll break it down a bit so you can modify and play with it while you're learning. I'm also suggesting Pandas, as it's a popular library for data manipulation and you'll be using it in the near future if you aren't already. I first initialize a results DataFrame to store all the data you'll be parsing:
import bs4
import requests
import pandas as pd
results = pd.DataFrame()
The page is then fetched with requests, parsed with BeautifulSoup, and every product's item-container div is collected. For each container the li entries under item-features are split on ':' into the columns of a one-row DataFrame, the item-promo text is added, and the row is appended to results, which is finally written out with to_csv. The full script:
import bs4
import requests
import pandas as pd
results = pd.DataFrame()
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&N=-1&isNodeId=1'
response = requests.get(my_url)
html = response.text
soup = bs4.BeautifulSoup(html, 'html.parser')
Container_Main = soup.find_all("div",{"class":"item-container"})
for container in Container_Main:
    item_features = container.find("ul", {"class": "item-features"})
    # if there are no item-features, move on to the next container
    if item_features is None:
        continue
    # one-row DataFrame holding this product's parsed features
    temp_df = pd.DataFrame(index=[0])
    features_list = item_features.find_all('li')
    for feature in features_list:
        # each li reads "Header: value"; the header becomes the column name
        split_str = feature.text.split(':')
        header = split_str[0]
        data = split_str[1].strip()
        temp_df[header] = data
    promo = container.find_all("p", {"class": "item-promo"})[0].text
    temp_df['promo'] = promo
    results = results.append(temp_df, sort=False).reset_index(drop=True)
results.to_csv('path/file.csv', index=False)
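One caveat worth flagging (my addition, not part of the original answer): DataFrame.append was removed in pandas 2.x, so on a current pandas the usual pattern is to collect the one-row frames in a list and concatenate once at the end, roughly (promo handling left out for brevity):
rows = []
for container in Container_Main:
    item_features = container.find("ul", {"class": "item-features"})
    if item_features is None:
        continue
    temp_df = pd.DataFrame(index=[0])
    for feature in item_features.find_all('li'):
        header, _, value = feature.text.partition(':')   # split on the first ':' only
        temp_df[header] = value.strip()
    rows.append(temp_df)                                  # collect instead of results.append(...)
results = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()
results.to_csv('path/file.csv', index=False)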
Scrape top 100 job results from Indeed using BeautifulSoup and Python
Date : March 29 2020, 07:55 AM
This should help you out. Do it in batches of 10 by changing the start value in the URL (https://www.indeed.co.in/jobs?q=software+developer&l=Bengaluru%2C+Karnataka&start=0), looping and incrementing that value each time.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
results = []
url = 'https://www.indeed.co.in/jobs?q=software+developer&l=Bengaluru,+Karnataka&start={}'
with requests.Session() as s:
    for page in range(0, 100, 10):
        # start=0, 10, 20, ... 90 covers the first 100 results in batches of 10
        res = s.get(url.format(page))
        soup = bs(res.content, 'lxml')
        titles = [item.text.strip() for item in soup.select('[data-tn-element=jobTitle]')]
        companies = [item.text.strip() for item in soup.select('.company')]
        data = list(zip(titles, companies))
        results.append(data)
newList = [item for sublist in results for item in sublist]
df = pd.DataFrame(newList)
df.to_json(r'C:\Users\User\Desktop\data.json')
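A small optional tweak (my addition): the zipped tuples carry no column names, so the DataFrame can be labelled when it is built, e.g.:
df = pd.DataFrame(newList, columns=['title', 'company'])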
How to scrape the data from the URL in Python using BeautifulSoup
Date : March 29 2020, 07:55 AM
This should help you fix your problem. Using the following URL: https://desiopt.com/search-results-jobs/?action=search&page=&listings_per_page=&view=list
import requests
from bs4 import BeautifulSoup
import csv
links = []
try:
    for item in range(1, 372):
        print(f"Extraction Page# {item}")
        r = requests.get(
            f"https://desiopt.com/search-results-jobs/?action=search&page={item}&listings_per_page=100&view=list")
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            # collect every company-profile link, skipping duplicates
            for item in soup.findAll('span', attrs={'class': 'captions-field'}):
                for a in item.findAll('a'):
                    a = a.get('href')
                    if a not in links:
                        links.append(a)
except KeyboardInterrupt:
    print("Good Bye!")
    exit()
data = []
try:
    for link in links:
        r = requests.get(link)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            for item in soup.findAll('div', attrs={'class': 'compProfileInfo'}):
                # every other span (indexes 0, 2, 4, 6) holds the name, phone, email and website
                a = [a.text.strip() for a in item.findAll('span')]
                if a[6] == '':
                    a[6] = 'N/A'
                data.append(a[0:7:2])
except KeyboardInterrupt:
    print("Good Bye!")
    exit()
while True:
    try:
        with open('output.csv', 'w+', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Name', 'Phone', 'Email', 'Website'])
            writer.writerows(data)
        print("Operation Completed")
    except PermissionError:
        # the file is still open elsewhere (e.g. in Excel); ask the user and retry
        print("Please Close The File")
        continue
    except KeyboardInterrupt:
        print("Good Bye")
        exit()
    break
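If you later want the CSV rows addressable by column name, csv.DictWriter is a drop-in alternative for the writing block (my addition, assuming each row in data keeps the Name/Phone/Email/Website order used above):
fields = ['Name', 'Phone', 'Email', 'Website']
with open('output.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=fields)
    writer.writeheader()
    writer.writerows(dict(zip(fields, row)) for row in data)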