Use the Requests library to download web pages
import requests
# Listing page for GitHub topics; the links scraped from it below are of the
# form https://github.com/topics/<name>.
topics_url = "https://github.com/topics"
# Download the listing page.
response = requests.get(topics_url)
# Displayed to confirm the download succeeded (200 is expected).
response.status_code
200
# Size of the downloaded HTML, in characters.
len(response.text)
177177
# Keep the decoded HTML in its own variable; it is saved and parsed below.
page_contents = response.text
# Preview the first 1000 characters.
page_contents[:1000]
'\n\n<!DOCTYPE html>\n<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark">\n <head>\n <meta charset="utf-8">\n <link rel="dns-prefetch" href="https://github.githubassets.com">\n <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n <link rel="dns-prefetch"....
Write the web page to a file
# Save a local copy of the downloaded HTML, written as UTF-8 encoded bytes.
with open('webpage.html', 'wb') as f:
    f.write(page_contents.encode("utf8"))
Parse and extract info
from bs4 import BeautifulSoup
# Parse the downloaded HTML so it can be searched by tag and class.
doc = BeautifulSoup(page_contents, 'html.parser')
# Class string carried by the <p> tags that hold each topic title.
selection_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
topic_title_tags = doc.find_all('p', {'class': selection_class})
# Number of topic titles found on the page.
len(topic_title_tags)
30
topic_title_tags[:5]
[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>]
selection_class="f5 color-fg-muted mb-0 mt-1"
topic_desc_tags = doc.find_all('p', {'class': selection_class})
len(topic_desc_tags)
30
topic_desc_tags[:5]
[<p class="f5 color-fg-muted mb-0 mt-1">
3D modeling is the process of virtually developing the surface and structure of a 3D object.
</p>,
<p class="f5 color-fg-muted mb-0 mt-1">
Ajax is a technique for creating interactive web applications.
</p>,
<p class="f5 color-fg-muted mb-0 mt-1">
Algorithms are self-contained sequences that carry out a variety of tasks.
</p>,.....
# Inspect the first title tag and the element that encloses it.
topic_title_tag0 = topic_title_tags[0]
div_tag = topic_title_tag0.parent
# Anchor tags with this class carry each topic's relative link in `href`.
topic_link_tags = doc.find_all('a', {'class': 'flex-grow-0'})
len(topic_link_tags)
30
topic0_url = "https://github.com" + topic_link_tags[0]['href']
print(topic0_url)
https://github.com/topics/3d
# Collect the text of every topic title tag.
topic_titles = [tag.text for tag in topic_title_tags]
print(topic_titles)
['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']
# Descriptions are wrapped in whitespace/newlines inside their <p> tags, so
# strip each one.
topic_descs = [tag.text.strip() for tag in topic_desc_tags]
topic_descs[:5]
['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
'Ajax is a technique for creating interactive web applications.',
'Algorithms are self-contained sequences that carry out a variety of tasks.',
'Amp is a non-blocking concurrency library for PHP.',
'Android is an operating system built by Google designed for mobile devices.']
# The page's links are relative (e.g. "/topics/3d"); prefix the site root to
# turn them into absolute URLs.
base_url = "https://github.com"
topic_urls = [base_url + tag['href'] for tag in topic_link_tags]
topic_urls
['https://github.com/topics/3d',
'https://github.com/topics/ajax',
'https://github.com/topics/algorithm',
'https://github.com/topics/amphp',
'https://github.com/topics/android',
'https://github.com/topics/angular',
'https://github.com/topics/ansible',
'https://github.com/topics/api',
'https://github.com/topics/arduino',
'https://github.com/topics/aspnet',
'https://github.com/topics/atom',
'https://github.com/topics/awesome']....
Create a CSV file
import pandas as pd
topics_dict = {
'title': topic_titles,
'description': topic_descs,
'url': topic_urls
}
topics_df = pd.DataFrame(topics_dict)
topics_df
title description url
0 3D 3D modeling is the process of virtually develo... https://github.com/topics/3d
1 Ajax Ajax is a technique for creating interactive w... https://github.com/topics/ajax
2 Algorithm Algorithms are self-contained sequences that c... https://github.com/topics/algorithm
3 Amp Amp is a non-blocking concurrency library for ... https://github.com/topics/amphp
4 Android Android is an operating system built by Google... https://github.com/topics/android
5 Angular Angular is an open source web application plat... https://github.com/topics/angular
6 Ansible Ansible is a simple and powerful automation en... https://github.com/topics/ansible
7 API An API (Application Programming Interface) is ... https://github.com/topics/api
8 Arduino Arduino is an open source hardware and softwar... https://github.com/topics/arduino.....
topics_df.to_csv('subjects.csv', index=None)
Getting information out of a topic page
topic_page_url = topic_urls[0]
topic_page_url
'https://github.com/topics/3d'
response = requests.get(topic_page_url)
response.status_code
200
# Size of the downloaded topic page, in characters.
len(response.text)
662290
# Parse the topic page.
topic_doc = BeautifulSoup(response.text, 'html.parser')
# Each repository card has an <h3> with this class string; it contains the
# owner link followed by the repository link.
h3_selection_class = "f3 color-fg-muted text-normal lh-condensed"
repo_tags = topic_doc.find_all('h3', {'class': h3_selection_class})
len(repo_tags)
30
repo_tags
[<h3 class="f3 color-fg-muted text-normal lh-condensed">
<a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-view-component="true" href="https://dev.to/mrdoob">
mrdoob
</a> /
<a class="text-bold wb-break-word" data-ga-click="Explore, go to repository, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="517d3d5cb9d89752156923904a4238816bc9b51ab7772f3e3644ce897d8dd4e5" data-view-component="true" href="https://dev.to/mrdoob/three.js">
three.js....
Pull out all the <a> tags from the first entry in repo_tags
a_tags = repo_tags[0].find_all('a')
a_tags
[<a data-ga-click="Explore, go to repository owner, location:explore feed" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4bdbc49d3c05ae7f70b531fbce709a384200b0768554e0172950286a8db30940" data-view-component="true" href="https://dev.to/mrdoob">
mrdoob...]
len(a_tags)
2
# First link in the heading: the repository owner's username.
a_tags[0].text.strip()
'mrdoob'
# Second link in the heading: the repository name.
a_tags[1].text.strip()
'three.js'
base_url="https://github.com"
repo_url = base_url + a_tags[1]['href']
repo_url
'https://github.com/mrdoob/three.js'
star_tags = topic_doc.find_all('a', {'class': 'tooltipped tooltipped-s btn-sm btn BtnGroup-item color-bg-default'})
len(star_tags)
30
star_tags
[<a aria-label="You must be signed in to star a repository" class="tooltipped tooltipped-s btn-sm btn BtnGroup-item color-bg-default" data-hydro-click='{"event_type":"authentication.click","payload":{"location_in_page":"star button","repository_id":576201,"auth_type":"LOG_IN","originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="b901d0051f5d392e6990ed43be8259b46c6bc69b73fa228065a5b24cb7acf2cf" data-view-component="true" href="https://dev.to/login?return_to=%2Fmrdoob%2Fthree.js" rel="nofollow"> <svg aria-hidden="true" class="octicon octicon-star v-align-text-bottom d-inline-block mr-2" data-view-component="true" height="16" version="1.1" viewbox="0 0 16 16" width="16">
<path d="M8 .25a.75.75 0 01.673.418l1.882 3.815 4.21.612a.75.75 0 01.416 1.279l-3.046 2.97.719 4.192a.75.75 0 01-1.088.791L8 12.347l-3.766 1.98a.75.75 0 01-1.088-.79l.72-4.194L.818 6.374a.75.75 0 01.416-1.28l4.21-.611L7.327.668A.75.75 0 018 .25zm0 2.445L6.615 5.5a.75.75 0 01-.564.41l-3.097.45 2.24 2.184a.75.75 0 01.216.664l-.528 3.084 2.769-1.456a.75.75 0 01.698 0l2.77 1.456-.53-3.084a.75.75 0 01.216-.664l2.24-2.183-3.096-.45a.75.75 0 01-.564-.41L8 2.694v.001z" fill-rule="evenodd"></path>.....
# Raw text of the first star button: the word "Star" and the count, padded
# with whitespace and newlines.
star_tags[0].text
' \n\n\n Star\n 78.6k\n'
# The count is the last whitespace-separated token of the button text;
# splitting on whitespace does not depend on the exact padding characters.
star_tags[0].text.split()[-1]
'78.6k'
def parse_star_count(stars_str):
    """Convert a star-count label such as ``'78.6k'`` or ``'512'`` to an int.

    Args:
        stars_str: The count as text. Surrounding whitespace is ignored. A
            trailing ``k`` (either case) means "thousands".

    Returns:
        The star count as an integer, e.g. ``'78.6k'`` -> ``78600``.

    Raises:
        ValueError: If the text (minus any ``k`` suffix) is not a number,
            including when it is empty.
    """
    stars_str = stars_str.strip()
    # Slicing with [-1:] is safe on an empty string, unlike indexing with [-1].
    if stars_str[-1:] in ('k', 'K'):
        # Round rather than truncate so binary floating-point error in the
        # multiplication cannot produce a result that is one too small.
        return int(round(float(stars_str[:-1]) * 1000))
    return int(stars_str)
# Feed the last whitespace-separated token of the button text (the count)
# to the parser.
parse_star_count(star_tags[0].text.split()[-1])
78600
get_repo_info(repo_tags[0], star_tags[0])
('mrdoob', 'three.js', 78600, 'https://github.com/mrdoob/three.js')
len(repo_tags)
def get_repo_info(h3_tag, star_tag):
    """Return the details of one repository card.

    Args:
        h3_tag: The card's heading tag. Its first ``<a>`` holds the owner's
            username and its second ``<a>`` holds the repository name and
            relative link.
        star_tag: The card's star button tag, whose text is the word "Star"
            followed by the count, separated by whitespace.

    Returns:
        A tuple ``(username, repo_name, stars, repo_url)``.

    Raises:
        IndexError: If the heading has fewer than two links or the star
            button has no text.
    """
    a_tags = h3_tag.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    # `base_url` is the module-level site root ("https://github.com").
    repo_url = base_url + a_tags[1]['href']
    # The count is the last whitespace-separated token of the button text.
    stars = parse_star_count(star_tag.text.split()[-1])
    return username, repo_name, stars, repo_url
# Column-oriented accumulator: one list per output column.
topic_repos_dict = {
    'username': [],
    'repo_name': [],
    'stars': [],
    'repo_url': [],
}
# Heading tags and star buttons are paired by position on the page.
for i, repo_tag in enumerate(repo_tags):
    username, repo_name, stars, repo_url = get_repo_info(repo_tag, star_tags[i])
    topic_repos_dict['username'].append(username)
    topic_repos_dict['repo_name'].append(repo_name)
    topic_repos_dict['stars'].append(stars)
    topic_repos_dict['repo_url'].append(repo_url)
topic_repos_df = pd.DataFrame(topic_repos_dict)
topic_repos_df
username repo_name stars repo_url
0 mrdoob three.js 78600 https://github.com/mrdoob/three.js
1 libgdx libgdx 19600 https://github.com/libgdx/libgdx
2 pmndrs react-three-fiber 16600 https://github.com/pmndrs/react-three-fiber
3 BabylonJS Babylon.js 15800 https://github.com/BabylonJS/Babylon.js
4 aframevr aframe 13700 https://github.com/aframevr/aframe
5 ssloy tinyrenderer 12000 https://github.com/ssloy/tinyrenderer
6 lettier 3d-game-shaders-for-beginners 12000 https://github.com/lettier/3d-game-shaders-for...
7 FreeCAD FreeCAD 10600 https://github.com/FreeCAD/FreeCAD
8 metafizzy zdog 9000 https://github.com/metafizzy/zdog
9 CesiumGS cesium 8200 https://github.com/CesiumGS/cesium.....
import os
def get_topic_page(topic_url):
    """Download a topic page and return it parsed.

    Args:
        topic_url: Absolute URL of the topic page.

    Returns:
        A ``BeautifulSoup`` document built from the response body with the
        ``html.parser`` backend.

    Raises:
        Exception: If the response status code is not 200.
    """
    # Download the page.
    response = requests.get(topic_url)
    # Only a 200 response is treated as success.
    if response.status_code != 200:
        raise Exception('Did not load web page {}'.format(topic_url))
    # Parse the HTML.
    topic_doc = BeautifulSoup(response.text, 'html.parser')
    return topic_doc
def get_repo_info(h3_tag, star_tag):
    """Return the details of one repository card.

    Args:
        h3_tag: The card's heading tag. Its first ``<a>`` holds the owner's
            username and its second ``<a>`` holds the repository name and
            relative link.
        star_tag: The card's star button tag, whose text is the word "Star"
            followed by the count, separated by whitespace.

    Returns:
        A tuple ``(username, repo_name, stars, repo_url)``.

    Raises:
        IndexError: If the heading has fewer than two links or the star
            button has no text.
    """
    a_tags = h3_tag.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    # `base_url` is the module-level site root ("https://github.com").
    repo_url = base_url + a_tags[1]['href']
    # The count is the last whitespace-separated token of the button text.
    stars = parse_star_count(star_tag.text.split()[-1])
    return username, repo_name, stars, repo_url
def get_topic_repos(topic_doc):
    """Extract the repositories listed on a parsed topic page.

    Args:
        topic_doc: Parsed topic page, as returned by ``get_topic_page``.

    Returns:
        A pandas ``DataFrame`` with the columns ``username``, ``repo_name``,
        ``stars`` and ``repo_url``, one row per repository heading found.

    Raises:
        IndexError: If the page has fewer star buttons than repository
            headings.
    """
    # <h3> tags containing the repo name, repo URL and username.
    h3_selection_class = "f3 color-fg-muted text-normal lh-condensed"
    repo_tags = topic_doc.find_all('h3', {'class': h3_selection_class})
    # Star button tags, paired with the headings by position.
    star_tags = topic_doc.find_all('a', {'class': 'tooltipped tooltipped-s btn-sm btn BtnGroup-item color-bg-default'})
    # Column-oriented accumulator: one list per output column.
    topic_repos_dict = {
        'username': [],
        'repo_name': [],
        'stars': [],
        'repo_url': [],
    }
    # Collect the details of every repository.
    for i, repo_tag in enumerate(repo_tags):
        username, repo_name, stars, repo_url = get_repo_info(repo_tag, star_tags[i])
        topic_repos_dict['username'].append(username)
        topic_repos_dict['repo_name'].append(repo_name)
        topic_repos_dict['stars'].append(stars)
        topic_repos_dict['repo_url'].append(repo_url)
    return pd.DataFrame(topic_repos_dict)
def scrape_topic(topic_url, path):
    """Scrape one topic page into a CSV file, unless the file already exists.

    Args:
        topic_url: Absolute URL of the topic page.
        path: Destination CSV path. If something already exists at this path
            the topic is skipped and nothing is downloaded.

    Returns:
        None.
    """
    # An existing file is taken to mean this topic was scraped on an earlier
    # run, so skip the download.
    if os.path.exists(path):
        print('The file {} already exists. skipping...'.format(path))
        return
    topic_df = get_topic_repos(get_topic_page(topic_url))
    # index=None keeps the DataFrame's row index out of the CSV.
    topic_df.to_csv(path, index=None)
topic_urls[6]
'https://github.com/topics/ansible'
get_topic_repos(get_topic_page(topic_urls[6])).to_csv('ansible.csv', index=None)
def get_topic_titles(doc):
    """Return the title text of every topic on the parsed listing page.

    Args:
        doc: Parsed topics listing page (anything exposing ``find_all``).

    Returns:
        A list of title strings, in page order; empty if none are found.
    """
    # Class string carried by the <p> tags that hold each topic title.
    selection_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
    topic_title_tags = doc.find_all('p', {'class': selection_class})
    return [tag.text for tag in topic_title_tags]
def get_topic_descs(doc):
    """Return the description of every topic on the parsed listing page.

    Args:
        doc: Parsed topics listing page (anything exposing ``find_all``).

    Returns:
        A list of description strings with surrounding whitespace removed,
        in page order; empty if none are found.
    """
    # Class string carried by the <p> tags that hold each topic description.
    selection_class = "f5 color-fg-muted mb-0 mt-1"
    topic_desc_tags = doc.find_all('p', {'class': selection_class})
    return [tag.text.strip() for tag in topic_desc_tags]
def get_topic_urls(doc):
    """Return the absolute URL of every topic on the parsed listing page.

    Args:
        doc: Parsed topics listing page (anything exposing ``find_all``).

    Returns:
        A list of URLs, in page order, each formed by prefixing
        ``https://github.com`` to the link's ``href``; empty if no links
        are found.
    """
    topic_link_tags = doc.find_all('a', {'class': 'flex-grow-0'})
    # The hrefs are relative (e.g. "/topics/3d"), so prefix the site root.
    base_url = "https://github.com"
    return [base_url + tag['href'] for tag in topic_link_tags]
def scrape_topics():
    """Download the GitHub topics listing and return it as a table.

    Returns:
        A pandas ``DataFrame`` with the columns ``title``, ``description``
        and ``url``, one row per topic on the listing page.

    Raises:
        Exception: If the response status code is not 200.
    """
    topics_url = "https://github.com/topics"
    response = requests.get(topics_url)
    if response.status_code != 200:
        raise Exception('Did not load web page {}'.format(topics_url))
    # Parse the page just downloaded, so the result does not depend on a
    # module-level `doc` left over from an earlier cell.
    doc = BeautifulSoup(response.text, 'html.parser')
    topics_dict = {
        'title': get_topic_titles(doc),
        'description': get_topic_descs(doc),
        'url': get_topic_urls(doc),
    }
    return pd.DataFrame(topics_dict)
def scrape_topics_repos():
    """Scrape the topics listing, then write one CSV of repositories per topic.

    Each topic's repositories are written to ``knowledge/<title>.csv``;
    ``scrape_topic`` skips a topic whose file already exists. Progress is
    printed as it goes.

    Returns:
        None.
    """
    print('Scraping record of subjects')
    topics_df = scrape_topics()
    # Create the output folder; exist_ok makes re-runs safe.
    os.makedirs("knowledge", exist_ok=True)
    for _, row in topics_df.iterrows():
        print('Scraping high repositories for "{}"'.format(row['title']))
        scrape_topic(row['url'], 'knowledge/{}.csv'.format(row['title']))
scrape_topics_repos()
Scraping record of subjects
Scraping high repositories for "3D"
The file knowledge/3D.csv already exists. skipping...
Scraping high repositories for "Ajax"
The file knowledge/Ajax.csv already exists. skipping...
Scraping high repositories for "Algorithm"
The file knowledge/Algorithm.csv already exists. skipping...
Scraping high repositories for "Amp"
The file knowledge/Amp.csv already exists. skipping...
Scraping high repositories for "Android".....