repl.it
@PythinPython/

Python Google Web Scraper Version 1

Python

No description

fork
loading
Files
  • main.py
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import requests
from bs4 import BeautifulSoup
import replit

# Characters the scraper refuses to accept in a search string.
notgooglestringchars = ['`', '~', '@', '#', '$', '%', '^', '&', '*', '(', ')', '"', '\'', '{', '}', '|', '\\', '-', '_', '+', '=', '[', ']', ':', ';', '<', '>', '/', '?', ',', '.']

# LC20lb is website title class
# e24kjd is website content highlighted by Google

# Markup fragments stripped from a result <div>. They are removed one after
# another, so the order of this list is significant and must be preserved.
# NOTE(review): '<div class="v9i61e>"' has its closing quote after the '>' and
# looks like a typo for '<div class="v9i61e">'; left unchanged -- confirm.
removefromresultchars = ['<div class="BNeawe s3v9rd AP7Wnd">', '<div>', '<span class="FCUp0c rQMQod">', '</div>', '</span>', '<span class="FCUp0c rQMQod">noun', '<div class="v9i61e>"', '<div class="Ap5OSd">', '<span class="r0bn4c rQMQod"', '<br/', '>noun']

# Wrapper tags stripped from a candidate "source" <span>, in removal order.
sourcewrappertags = ['<span class="FCUp0c rQMQod">', '<span class="rQMQod Xb5VRe">', '<span class="rQMQod aJyiOc">', '<span class="r0bn4c rQMQod">', '</span>']

# Everything from this marker onwards is cut off a cleaned result.
snippetendmarker = '<span class="BNeawe"'

# Text that separates the leading metadata from the body of a related result.
relatedseparator = '> · '


def has_unsupported_chars(search):
    """Return True if *search* contains any character in notgooglestringchars."""
    return any(char in search for char in notgooglestringchars)


def strip_fragments(text, fragments):
    """Return *text* with every fragment removed, applied in list order."""
    for fragment in fragments:
        text = text.replace(fragment, '')
    return text


def clean_snippet(raw_html):
    """Turn the HTML of one result <div> into the text shown to the user.

    The known markup fragments are removed, then the text is cut at the
    first snippetendmarker. ``str.partition`` is used instead of slicing with
    ``str.find`` so that a snippet WITHOUT the marker is returned whole; the
    ``find`` approach returns -1 in that case and silently drops the last
    character.
    """
    cleaned = strip_fragments(raw_html, removefromresultchars)
    return cleaned.partition(snippetendmarker)[0]


def related_results(snippets):
    """Yield the lines to print for the "Other Related Results" section.

    *snippets* are already-cleaned result strings. A snippet that still
    contains a bare ``<span>`` yields 'Result unable to load.'. Otherwise the
    text after the first relatedseparator is yielded (prefixed with a
    newline) unless it equals the previously yielded text or is empty.
    Snippets without the separator yield nothing.
    """
    previous = ''
    for snippet in snippets:
        if '<span>' in snippet:
            yield 'Result unable to load.'
            continue
        _, separator, body = snippet.partition(relatedseparator)
        if separator and body != previous:
            yield '\n' + body
            previous = body


def main():
    """Prompt for a search, query Google, and print the scraped results."""
    print("This is a Python3 based webscraper. \nIt is meant to enable a user to perform google searches without having to visit websites. \nPlease enter a search.")
    usersearch = input('Search: ')
    print('Searching...' + usersearch.replace(' ', '+'))
    if has_unsupported_chars(usersearch):
        print('Characters in search unable to process.')
        return

    try:
        # params= lets requests URL-encode the query (spaces become '+');
        # the timeout keeps the script from hanging on a stalled connection.
        response = requests.get('https://www.google.com/search', params={'q': usersearch}, timeout=10)
    except requests.RequestException as error:
        print('\nSearch failed: ' + str(error))
        return
    webscrapesoup = BeautifulSoup(response.content, "html.parser")

    # Query and clean the result divs once; both sections below reuse them.
    resultdivs = webscrapesoup.find_all("div", {"class": "BNeawe s3v9rd AP7Wnd"})
    snippets = [clean_snippet(str(resultdiv)) for resultdiv in resultdivs]

    # RESULT
    if snippets:
        print('\nRESULT - \n' + snippets[0])
    else:
        # Previously an IndexError when the page had no matching div.
        print('\nRESULT - \nNo result was found.')

    # SOURCE
    foundsource = False
    for fromline in webscrapesoup.find_all("span", {"class": "rQMQod"}):
        sourcetext = strip_fragments(str(fromline), sourcewrappertags)
        # Spaces are ignored only for the 'http' test; the text is printed as-is.
        if sourcetext.replace(' ', '').startswith('http'):
            foundsource = True
            print('\nSOURCE - ' + sourcetext)
    if not foundsource:
        print('\nSOURCE - No source was found.')

    # Other Results
    print('\nOther Related Results - ')
    for line in related_results(snippets):
        print(line)


if __name__ == "__main__":
    main()
?