General web page light scraper.

teckk

from LinuxQuestions.org on 2021-06-29 12:18 (#5KNGA)

For the member that posted here about a scraper, or anyone else who is interested:
https://www.linuxquestions.org/quest...39#post6262339

If what you want is a general light scraper for web pages that will spit out web page info, and since you are referencing Python, this is one of my demo scripts that I made and keep in my Python cache. I have another, larger one that uses buttons to display one item at a time instead of all at once.

It needs python3, urllib, tkinter, beautiful soup, html2text.
Modify it as you wish.

It's not meant to be efficient. In fact, soup loops over the page several times, appending to lists, which makes it slow. I did it that way as a demo for me to reference. It is small enough to fit in a single script file, and small enough to post. You can uncomment one URL at a time (lines 32-36) and see the output.

I checked it before posting. It still works.
Code:

#!/usr/bin/python

#ttk.Notebook tabs get web page info demo

from tkinter import *
from tkinter import ttk
from urllib import request, parse
from bs4 import BeautifulSoup
from html2text import html2text, HTML2Text

#Edit user agents here
#Firefox 88 on Windows 10
a = ('Mozilla/5.0 (Windows NT 10.0; x86_64; rv:88.0) '
     'Gecko/20100101 Firefox/88.0')
#Firefox 88 on Arch Linux
b = ('Mozilla/5.0 (X11; Arch; Linux x86_64; rv:88.0) '
     'Gecko/20100101 Firefox/88.0')
#Mobile WebKit on an iPhone
c = ('Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) '
     'AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148')
#Googlebot: the info URL belongs inside the parentheses, with no space
#after the '+'
d = ('Mozilla/5.0 (compatible; Googlebot/2.1; '
     '+http://www.google.com/bot.html)')

#Choose user agent here
agent = c
#Make request header for picky servers like cloudflare
uagent = {
    'User-Agent': agent,
    'Accept': 'text/html,application/xhtml+xml,'
              'application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    #getSource decodes the body as received, so ask for an unencoded
    #response. 'identity' is the HTTP token for that; 'none' is not a
    #registered content coding.
    'Accept-Encoding': 'identity',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

#Page to inspect: keep exactly one of these uncommented
#url = 'file:///path/to/MyLocalFile.html'
url = 'https://www.linuxquestions.org/questions/'
#url = 'http://books.toscrape.com'
#url = 'http://ipchicken.com'
#url = 'https://www.bbc.co.uk/iplayer/categories/films/most-recent?page=1'

#Base name of the cache file; '.html' is appended where it is opened
cacheName = 'MyPageCache'

#Get page source with urllib, save to cache.
class getSource():
    """Download a page and write its text to the cache file.

    Instantiating the class performs the download. Afterwards the
    instance exposes:

    req  -- the urllib Request that was sent
    get  -- the response object (already closed once the body is read)
    html -- the page body decoded as UTF-8, undecodable bytes dropped

    Parameters:
    url   -- address to fetch
    agent -- dict of request headers, passed to Request as ``headers``

    Errors raised by urlopen() or by writing the cache file propagate
    to the caller.
    """

    def __init__(self, url, agent):
        self.req = request.Request(url, data=None, headers=agent)
        #Close the response as soon as the body has been read instead
        #of leaving it open until garbage collection
        with request.urlopen(self.req) as response:
            self.get = response
            self.html = response.read().decode('utf-8', 'ignore')
        #Download page to cache file
        #NOTE(review): written with the platform default text encoding;
        #the parsers in this file open the cache the same way, so any
        #explicit encoding has to be added to writer and readers together
        with open(cacheName+'.html', 'w') as f:
            f.write(self.html)

#Parse source from cache with beautifulsoup
class parseSoup():
    """Parse the cached page with BeautifulSoup and expose its parts.

    Instantiating the class reads the cache file and sets:

    soup, sou -- the BeautifulSoup object (both names refer to it)
    txt       -- non-blank text lines, stripped, blank-line separated
    lnk       -- href of every <a> tag
    css       -- href of every <link> tag
    scr       -- src of every <script> tag
    ima       -- src of every <img> tag

    All addresses are made absolute against ``url`` and joined with a
    blank line between entries. Tags whose attribute is missing or
    empty are skipped.
    """

    def __init__(self, url):
        #Open cache file for parsing
        with open(cacheName+'.html', 'r') as cache:
            self.soup = BeautifulSoup(cache, 'html.parser')

        def absolute(tag_name, attr_name):
            #Absolute form of attr_name for each tag_name tag that has it
            found = []
            for tag in self.soup.find_all(tag_name):
                target = tag.attrs.get(attr_name)
                if target:
                    found.append(parse.urljoin(url, target))
            return found

        #Links, CSS, scripts and images on page
        self.lnk = '\n\n'.join(absolute('a', 'href'))
        self.css = '\n\n'.join(absolute('link', 'href'))
        self.scr = '\n\n'.join(absolute('script', 'src'))
        self.ima = '\n\n'.join(absolute('img', 'src'))

        #Page text without the blank lines
        stripped = [line.strip()
                    for line in self.soup.get_text().splitlines()]
        self.txt = '\n\n'.join(line for line in stripped if line)

        self.sou = self.soup

#Parse source from cache with html2text
class parseHtml2text():
    """Convert the cached page with html2text, links ignored.

    Instantiating the class reads the cache file and stores the
    converted text in ``txt``.
    """

    def __init__(self):
        #Read the whole cached page in one go
        with open(cacheName+'.html', 'r') as cache:
            markup = cache.read()

        converter = HTML2Text()
        converter.ignore_links = True
        self.txt = converter.handle(markup)

#Tk Browser
class TkBrowser(Tk):
    """Main window of the inspector.

    Creating an instance sets up the window, shows the NoteBook frame
    and enters the Tk main loop, so the constructor only returns once
    the main loop ends.
    """

    def __init__(self):
        super().__init__()
        #Frame currently on display, None until the first switch
        self._frame = None
        self.geometry('1200x800')
        self.title('Web Page Inspector')
        self.switch_frame(NoteBook)

        self.mainloop()

    #Frame switcher
    def switch_frame(self, frame_class):
        """Show a new frame_class(self), destroying the frame it replaces."""
        #Build the replacement first, then drop the old frame
        replacement = frame_class(self)
        current = self._frame
        if current is not None:
            current.destroy()
        replacement.pack()
        self._frame = replacement

#ttk.Notebook
class NoteBook(Frame):
    """Frame that builds the seven-tab notebook for the page.

    The page is downloaded and parsed once here and the result is
    handed to every tab, instead of each tab fetching it again.
    """

    def __init__(self, master):
        Frame.__init__(self, master)
        #The notebook is placed in master, not in this frame, so that
        #it fills the window (the caller packs this frame without
        #expand/fill).
        #NOTE(review): as a sibling of this frame, the notebook is not
        #destroyed when the frame is.
        self.notebook = ttk.Notebook(master)
        style = ttk.Style()
        style.configure("TNotebook", background='grey50')
        style.configure('TNotebook.Tab', font=('Monospace', '20'),
                        background='lightblue', foreground='black')
        style.map("TNotebook.Tab",
                  background=[('selected', 'lightyellow')],
                  foreground=[('selected', 'black')])

        #One download and one parse shared by all tabs
        page = _PageData()
        self.tab1 = Tab1(self.notebook, page)
        self.tab2 = Tab2(self.notebook, page)
        self.tab3 = Tab3(self.notebook, page)
        self.tab4 = Tab4(self.notebook, page)
        self.tab5 = Tab5(self.notebook, page)
        self.tab6 = Tab6(self.notebook, page)
        self.tab7 = Tab7(self.notebook, page)
        self.notebook.add(self.tab1, text="SoupText")
        self.notebook.add(self.tab2, text="Links")
        self.notebook.add(self.tab3, text="Scripts")
        self.notebook.add(self.tab4, text="CSS")
        self.notebook.add(self.tab5, text="Source")
        self.notebook.add(self.tab6, text="Html2Text")
        self.notebook.add(self.tab7, text="Images")
        self.notebook.pack(expand=True, fill='both')


#Page info shared by the tabs
class _PageData:
    """Download the page to the cache and keep every parsed view of it.

    p   -- the parseSoup object      h   -- the parseHtml2text object
    txt -- soup text                 lnk -- links
    scr -- scripts                   css -- hrefs of <link> tags
    sou -- the soup object           ima -- images
    h2t -- html2text output
    """

    def __init__(self):
        #Download the file to cache
        getSource(url, uagent)
        #Parse it
        self.p = parseSoup(url)
        #Get lists
        self.txt = self.p.txt
        self.lnk = self.p.lnk
        self.scr = self.p.scr
        self.css = self.p.css
        self.sou = self.p.sou
        self.ima = self.p.ima
        self.h = parseHtml2text()
        self.h2t = self.h.txt


def _page_or_fetch(page):
    """Return page, or freshly downloaded page data when page is None."""
    return _PageData() if page is None else page


#Text widget with a scrollbar, common to all tabs
class _TextTab(Frame):
    """Notebook page holding a Text widget and its vertical scrollbar."""

    def __init__(self, master):
        Frame.__init__(self, master)

        self.scroll = Scrollbar(self)
        self.scroll.config(width=25, borderwidth=3, bg='gray50')
        self.scroll.pack(side='right', fill='y')

        self.text = Text(self, yscrollcommand=self.scroll.set)
        self.text.config(font=('Monospace', 20), highlightcolor='blue',
                         highlightthickness=3, borderwidth=5,
                         bg='lightyellow')
        self.scroll.config(command=self.text.yview)
        self.text.pack(fill='both', expand=True)

    def show(self, content):
        """Replace whatever the text widget holds with str(content)."""
        #Clear the text widget, put something in it
        self.text.delete('1.0', END)
        self.text.insert('1.0', str(content))


#Notebook Tab1 Text, info for other tabs
class Tab1(_TextTab):
    """Soup text tab; also carries every piece of parsed page info.

    page -- optional object providing p, h, txt, lnk, scr, css, sou,
            ima and h2t; when omitted the page is downloaded and
            parsed here.
    """

    def __init__(self, master, page=None):
        _TextTab.__init__(self, master)
        page = _page_or_fetch(page)
        self.p = page.p
        self.txt = page.txt
        self.lnk = page.lnk
        self.scr = page.scr
        self.css = page.css
        self.sou = page.sou
        self.ima = page.ima
        self.h = page.h
        self.h2t = page.h2t
        self.show(self.txt)


#Notebook Tab 2 Links
class Tab2(_TextTab):
    """Links tab. page: optional object with a ``lnk`` attribute;
    when omitted the page is downloaded and parsed here."""

    def __init__(self, master, page=None):
        _TextTab.__init__(self, master)
        self.lnk = _page_or_fetch(page).lnk
        self.show(self.lnk)


#Notebook Tab 3 Scripts
class Tab3(_TextTab):
    """Scripts tab. page: optional object with a ``scr`` attribute;
    when omitted the page is downloaded and parsed here."""

    def __init__(self, master, page=None):
        _TextTab.__init__(self, master)
        self.scr = _page_or_fetch(page).scr
        self.show(self.scr)


#Notebook Tab 4 CSS
class Tab4(_TextTab):
    """CSS tab. page: optional object with a ``css`` attribute;
    when omitted the page is downloaded and parsed here."""

    def __init__(self, master, page=None):
        _TextTab.__init__(self, master)
        self.css = _page_or_fetch(page).css
        self.show(self.css)


#Notebook Tab5 Source
class Tab5(_TextTab):
    """Source tab. page: optional object with a ``sou`` attribute;
    when omitted the page is downloaded and parsed here."""

    def __init__(self, master, page=None):
        _TextTab.__init__(self, master)
        #sou is the soup object itself; show() displays its str() form
        self.sou = _page_or_fetch(page).sou
        self.show(self.sou)


#Notebook Tab6 Html2Text
class Tab6(_TextTab):
    """Html2Text tab. page: optional object with an ``h2t`` attribute;
    when omitted the page is downloaded and parsed here."""

    def __init__(self, master, page=None):
        _TextTab.__init__(self, master)
        self.h2t = _page_or_fetch(page).h2t
        self.show(self.h2t)


#Notebook Tab7 Images
class Tab7(_TextTab):
    """Images tab. page: optional object with an ``ima`` attribute;
    when omitted the page is downloaded and parsed here."""

    def __init__(self, master, page=None):
        _TextTab.__init__(self, master)
        self.ima = _page_or_fetch(page).ima
        self.show(self.ima)

#Open the inspector window only when run as a script
if __name__ == "__main__":
    TkBrowser()

latest?i=YSDz8puKLIg:wflBDm0Bh_k:F7zBnMy

latest?i=YSDz8puKLIg:wflBDm0Bh_k:V_sGLiP

latest?i=YSDz8puKLIg:wflBDm0Bh_k:gIN9vFw

Source	RSS or Atom Feed
Feed Location	https://feeds.feedburner.com/linuxquestions/latest
Feed Title	LinuxQuestions.org
Feed Link	https://www.linuxquestions.org/questions/