Project author: hsayed21

Project description:
Search Engine
Primary language: Python
Project address: git://github.com/hsayed21/Search-Engine.git
Created: 2020-04-22T14:02:20Z
Project community: https://github.com/hsayed21/Search-Engine

License:



Search Engine

A project for the [ Language Engineering ] course, FCS Level 3.


Install Libraries

```
pip install requests bs4 nltk
```

Then, from Python, download the NLTK data the code depends on:

```python
import nltk

nltk.download('punkt')      # tokenizer models used by nltk.word_tokenize
nltk.download('wordnet')    # required by nltk.corpus.wordnet
nltk.download('stopwords')  # required by nltk.corpus.stopwords
```

Libraries Used
```python
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
```

Search by sending the query to Google

```python
query = input("Enter word to search: ")
url = f"https://google.com/search?q={query}"
res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
```
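Note that the f-string interpolates the raw query, so a multi-word search ends up with a literal space in the URL. A minimal sketch of a safer variant, using `urllib.parse.quote_plus` from the standard library (an assumption of this sketch, not part of the original code):

```python
from urllib.parse import quote_plus

import requests

query = input("Enter word to search: ")
# quote_plus percent-encodes reserved characters and turns spaces into '+'
url = f"https://google.com/search?q={quote_plus(query)}"
res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
res.raise_for_status()  # fail early on a non-2xx response
```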

Extract the HTML content, then collect every result div that contains a title, a description, and a link, identified by the div's class

```python
soup = BeautifulSoup(res.content, "html.parser")
result_div = soup.find_all('div', attrs={'class': 'ZINbbc'})

links = []         # store links
titles = []        # store titles
descriptions = []  # store descriptions

for r in result_div:
    try:
        link = r.find('a', href=True)
        title = r.find('div', attrs={'class': 'vvjwJb'}).get_text()
        description = r.find('div', attrs={'class': 's3v9rd'}).get_text()
        # Skip empty results; otherwise store them in the lists
        if link is not None and title != '' and description != '':
            links.append(link['href'])
            titles.append(title)
            descriptions.append(description)
    except AttributeError:
        # find() returned None for the title or description div
        continue
```
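Before ranking anything, it can help to sanity-check what the scrape collected. A small illustrative snippet (the slicing and truncation here are an addition, not in the original project):

```python
# Peek at the first three scraped results
for title, link, description in list(zip(titles, links, descriptions))[:3]:
    print("Title:      ", title)
    print("Raw link:   ", link)              # still in /url?q=...&sa=... form here
    print("Description:", description[:80])  # truncate long snippets
    print("---")
```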
```python
to_remove = []
clean_links = []

for i, l in enumerate(links):  # enumerate yields index and value
    # Make sure the href is a real result link: it starts with /url?q=
    clean = re.search(r'/url\?q=(.*)&sa', l)
    # A None result means this is not a useful link
    if clean is None:
        to_remove.append(i)
        continue
    clean_links.append(clean.group(1))

# Remove the titles and descriptions of the discarded links.
# Delete from the highest index down so earlier deletions
# do not shift the positions that still have to be removed.
for x in reversed(to_remove):
    del titles[x]
    del descriptions[x]
```
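To see what the regular expression does, here is a worked example on a made-up href with the shape Google's basic HTML results use (the sample URL is illustrative only). `urllib.parse.unquote` additionally decodes percent-escapes, a step the original code skips:

```python
import re
from urllib.parse import unquote

href = "/url?q=https://en.wikipedia.org/wiki/Search_engine&sa=U&ved=abc123"
m = re.search(r'/url\?q=(.*)&sa', href)
if m:
    raw = m.group(1)     # 'https://en.wikipedia.org/wiki/Search_engine'
    print(unquote(raw))  # decode any %XX escapes left in the URL
```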

A helper that sorts a list of tuples by its second value, or by any position given by ind

```python
def Sort_Tuple(tup, ind):
    tup.sort(key=lambda x: x[ind])
    return tup
```
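A quick illustration with invented tuples: `list.sort` works in place, so the helper mutates and returns the same list, and because Python's sort is stable, ties keep their original order.

```python
pairs = [(0, 5), (1, 2), (2, 9)]
print(Sort_Tuple(pairs, 1))  # sorted by the second element: [(1, 2), (0, 5), (2, 9)]
```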
Rank each page with a simplified Lesk approach: count how many of its description words appear in the definition and first example of each WordNet synset of the query, keep the best synset per page, and print the pages sorted by that score.

```python
List_all_rank = []  # store the best rank for each page

def lesk(query, sentence, ind):
    Text1 = sentence.lower()  # string to lowercase
    words = nltk.word_tokenize(Text1)
    stop_words = stopwords.words("english")
    stop_words += ['can', 'will', 'use', 'one', 'using', 'used', 'also', 'see', 'first', 'like']
    stop_words += ['page', 'get', 'new', 'two', 'site', 'blog', 'many', 'may', "don't", 'dont', 'way']
    stop_words += ['last', 'best', 'able', 'even', 'next', 'let', 'none', 'every', 'three']
    stop_words += ['lot', 'well', 'chart', 'much', 'based', 'important', 'posts', 'reads', 'least']
    stop_words += ['still', 'follow', 'called', 'and', 'this', 'that', 'there', 'as', 'the', 'is']
    stop_words += ['/', '=', '.', ',']
    filtered_words = []
    for word in words:
        if word not in stop_words:
            filtered_words.append(word)
    word = query
    synsets = wordnet.synsets(word)
    if not synsets:  # the query has no WordNet synsets; nothing to rank
        return
    List_rank = []  # store the rank of each synset
    # Count how many description words are mentioned in each synset;
    # at the end take the maximum and store it in List_all_rank.
    for x, synset in enumerate(synsets):
        da = []  # matched definition words (reset per synset, no repetition)
        ea = []  # matched example words (reset per synset, no repetition)
        for i in filtered_words:
            if i in synset.definition().lower() and i not in da:
                da.append(i)
            if len(synset.examples()) != 0:
                if i in synset.examples()[0].lower() and word in synset.examples()[0].lower():
                    if i not in ea:
                        ea.append(i)
        r = len(da) + len(ea)  # description words found in this synset
        List_rank.append((x, r))  # (synset number, score)
    Sorted_list_rank = Sort_Tuple(List_rank, 1)  # sort by second item (word count)
    l = list(Sorted_list_rank[-1])  # after sorting, the last tuple holds the greatest score
    l.insert(0, ind)  # prepend the page number
    List_all_rank.append(tuple(l))

for idx, val in enumerate(descriptions):
    lesk(query, val, idx)

s_li = Sort_Tuple(List_all_rank, 2)  # tuples are (page num, synset num, word count); sort by word count
for x in range(len(List_all_rank), 0, -1):  # print best-ranked pages first
    print("Page Number", s_li[x-1][0] + 1, ", Most Synset Num", s_li[x-1][1], ", With Rank", s_li[x-1][2])
    print("Title:", titles[s_li[x-1][0]])
    print("Link:", clean_links[s_li[x-1][0]])
    print("-------------------------\n")
```
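The ranking relies on two WordNet accessors, `synset.definition()` and `synset.examples()`. A short standalone probe of what they return, using the word "bank" purely as an arbitrary example:

```python
from nltk.corpus import wordnet

for synset in wordnet.synsets('bank')[:3]:
    print(synset.name())
    print('  definition:', synset.definition())
    print('  examples:  ', synset.examples())  # may be an empty list
```

Because `examples()` can be an empty list, the length check inside `lesk` is needed before indexing `examples()[0]`.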