forked from johneliades/code_crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcode_crawler.py
More file actions
executable file
·107 lines (92 loc) · 3.15 KB
/
code_crawler.py
File metadata and controls
executable file
·107 lines (92 loc) · 3.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/python3
# code_crawler: google a programming question and print the first code
# snippet scraped from a set of known Q&A / tutorial sites.
from bs4 import BeautifulSoup
from googlesearch import search
from pygments import lexers, highlight
from pygments.formatters import TerminalFormatter
import urllib3
import sys
import random
import os
import certifi
# NOTE(review): presumably quiets TensorFlow's C++ logging ('3' = errors
# only) in case a dependency pulls TF in — no TF import is visible here,
# so confirm this is still needed.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
class bcolors:
    """ANSI escape codes used to colour the result header on the terminal."""

    BOLD = '\033[1m'   # bold text
    CYAN = '\033[96m'  # bright cyan foreground
    RED = '\033[31m'   # red foreground
    ENDC = '\033[0m'   # reset all attributes
# Sites this crawler knows how to extract a snippet from.  Order matters:
# the first entry found in a result URL decides which scraper runs.
available_sites = [
    "w3schools",
    "stackoverflow",
    "tutorialspoint",
    "geeksforgeeks",
    "pypi",
    "askubuntu",
    "mathworks",
    "stackexchange",
    "unrealengine",
    "microsoft",
]

# Language names recognised in the query for syntax highlighting.  Order
# matters: the first name that appears in the query selects the lexer.
languages = [
    "c",
    "java",
    "python",
    "lua",
    "javascript",
    "js",
    "go",
    "golang",
    "cpp",
    "c++",
    "matlab",
    "ruby",
    "c#",
    "csharp",
    "css",
    "html",
    "latex",
]
# Build the search query.  Prefer the command line; fall back to an
# interactive prompt when no argument was given.
try:
    query = sys.argv[1]
    # An unquoted multi-word query arrives as several argv entries —
    # join them back into a single string.
    if "\"" not in query and "'" not in query:
        query = ' '.join(sys.argv[1:])
except IndexError:
    # Only a missing argv[1] should trigger the prompt; the original bare
    # `except:` also swallowed KeyboardInterrupt/SystemExit.
    query = input("Give search query: ")
print()
# How many Google results to fetch and inspect.
num_results = 7
# HTTPS connection pool with certificate verification against certifi's
# CA bundle.
http = urllib3.PoolManager(ca_certs=certifi.where(), cert_reqs='REQUIRED')
# Snippets already printed — used to suppress duplicate output across
# multiple result URLs.
total_results = []
# Visit each search hit, scrape a code snippet from the sites we support,
# de-duplicate, and print it with a coloured header and (when a language
# is named in the query) syntax highlighting.
for url in search(query, tld="com", lang='en', num=num_results, stop=num_results, pause=random.uniform(0, 0.5)):
    # First supported site whose name occurs in the URL decides the scraper.
    # (The original also re-checked `site in available_sites`, which was
    # always true by construction — dead check removed.)
    site = next((s for s in available_sites if s in url), None)
    if site is None:
        continue
    response = http.request('GET', url)
    soup = BeautifulSoup(response.data, features="html.parser")
    try:
        # Each branch navigates that site's HTML down to the snippet text.
        if site == "w3schools":
            result = soup.find("div", {"class": "w3-code"})
            result = result.get_text(separator="\n").strip()
        elif site in ("stackoverflow", "askubuntu", "stackexchange"):
            result = soup.find("div", {'class': ['answer', 'accepted-answer']})
            result = result.find("div", {"class": "answercell"})
            result = result.find("div", {"class": "s-prose"})
            result = result.find("pre").text
        elif site == "tutorialspoint":
            result = soup.find("div", {"class": "tutorial-content"})
            result = result.find("pre").text
        elif site == "geeksforgeeks":
            code_td = soup.find("td", {"class": "code"})
            result = "".join(line.getText() + "\n"
                             for line in code_td.find_all(class_="line"))
        elif site == "pypi":
            result = soup.find("span", id="pip-command")
            result = result.get_text().strip()
        elif site == "mathworks":
            result = soup.find("div", {"class": "codeinput"})
            result = result.find("pre").text
        elif site == "unrealengine":
            result = soup.find("div", {'class': ['answer', 'accepted-answer']})
            result = result.find("div", {"class": "answer-body"})
            result = result.find("pre").text
        elif site == "microsoft":
            result = soup.find("code")
            result = result.get_text().strip()
        result = result.strip() + "\n"
        # Skip snippets we have already shown.
        if result in total_results:
            continue
        total_results.append(result)
    except Exception:
        # Page layout didn't match (typically `find` returned None and the
        # next access raised AttributeError) — move on to the next result.
        # Narrowed from the original bare `except:`.
        continue
    header = site + ": " + url
    print(bcolors.CYAN + bcolors.BOLD + site + ": " + bcolors.RED + url + bcolors.ENDC)
    # Underline the header, capped at 80 columns (single print instead of
    # one print call per character).
    print(u'\u2501' * min(80, len(header)))
    # Pick a Pygments lexer from the first language mentioned in the query.
    query_words = query.lower().split(' ')
    lexer = None
    for language in languages:
        if language.lower() in query_words:
            lexer = lexers.get_lexer_by_name(language)
            break
    if lexer is not None:
        print(highlight(result, lexer, TerminalFormatter()))
    else:
        print(result)