Projects

Algorithms

Easy
Linear Search
This is the simplest searching algorithm. It sequentially checks each element of the list until a match is found or the whole list has been searched.
    def linear_search(arr, x):
        for i in range(len(arr)):
            if arr[i] == x:
                return i
        return -1
Binary Search
A more efficient searching algorithm than linear search, but it requires the list to be sorted. It starts by checking the middle element of a list and divides the search space in half at each step until it finds the target element or exhausts the search space. ...

Hugo Tutorial

Hugo is a powerful and fast static site generator written in Go. It’s perfect for blogs, portfolios, company websites, and much more.
Prerequisites
Before you begin, make sure you have the following installed:
- Git
- Hugo
Step 1: Installing Hugo
To install Hugo on your system, use the following commands based on your operating system:
macOS
    brew install hugo
Windows (with Chocolatey)
    choco install hugo -confirm
Linux (with snap)
    sudo snap install hugo
Step 2: Creating a New Site
Once Hugo is installed, you can create a new site using the following command: ...

大众点评爬虫教程

# Dianping review scraper.
#
# Drives Chrome through Selenium, collects shop links from a category list
# page, then walks each shop's "/review_all" pages and stores one row per
# review both in a MySQL table ('dazdp_data') and in an Excel workbook.

# Environment setup
import os
import time

import pandas as pd
from sqlalchemy import create_engine
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Step 1: Configure the browser
chrome_options = Options()
chrome_options.add_argument('--incognito')  # incognito mode
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
service = Service('/Users/skg/Desktop/test/chromedriver')
driver = webdriver.Chrome(service=service, options=chrome_options)
# Registered through CDP so it is evaluated on every new document:
# navigator.webdriver then reads as undefined.
script = '''
Object.defineProperty(navigator, 'webdriver', {
    get: () => undefined
})
'''
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": script})
driver.maximize_window()

# Step 2: XPath locators
next_page_xp = "//div[@class='page']//a[@title='下一页']"
shop_lis_xp = "//div[@id='shop-all-list']//ul/li//div[@class='tit']/a"
shop_title_xp = "//h1[@class='shop-name']"
shop_score_xp = "//div[contains(@class,'score score')]"
shop_reviewcount_xp = "//span[@id='reviewCount']|//span[@class='reviews']"
shop_avgpricetitle_xp = "//span[@id='avgPriceTitle']|//span[@class='price']"
shop_kouwei_xp = "//span[contains(text(),'口味')]"
shop_huanjing_xp = "//span[contains(text(),'环境')]"
shop_fuwu_xp = "//span[contains(text(),'服务')]"
shop_adress_xp = "//span[@id='address']|//div[contains(text(),'地址')]"
shop_tel_xp = "//p[@class='expand-info tel']|//div[contains(text(),'电话')]"
reviews_block_xp = "//div[@class='reviews-items']/ul/li"
reviews_zhankai_xp = ".//a[@class='fold' and contains(@data-click-name,'展开评价')]"
reviews_body_xp = ".//div[@class='review-words']"
reviews_time_xp = ".//span[@class='time']"
reviews_nextpage_xp = "//div[@class='reviews-pages']//a[@class='NextPage']"
forbidden_xp = "//h1[contains(text(),'Forbidden')]"

# Step 3: Parameters
# Visit the login page.
# NOTE(review): run top to bottom, the next driver.get() replaces this page
# immediately, leaving no time to log in -- presumably these steps were meant
# to be executed one at a time; confirm.
driver.get("https://account.dianping.com/pclogin")
# Visit the shop list page of the target category.
driver.get("https://www.dianping.com/shanghai/ch10/g110")
# Number of shop list pages to scrape.
shop_lis_page_num = 2
# Number of review pages to scrape per shop.
reviews_page_num = 10


def _text_of(scope, xpath):
    """Return the textContent of the first *xpath* match under *scope*.

    Returns '' when the element is missing, has no textContent, or the
    lookup fails, so callers can chain str methods unconditionally.
    """
    try:
        return scope.find_element(By.XPATH, xpath).get_attribute('textContent') or ''
    except Exception:
        return ''


def _try_click(scope, xpath):
    """Click the first *xpath* match under *scope*; return True on success."""
    try:
        scope.find_element(By.XPATH, xpath).click()
    except Exception:
        return False
    return True


def _is_forbidden():
    """Return True when the current page shows an <h1> containing 'Forbidden'."""
    try:
        return bool(driver.find_elements(By.XPATH, forbidden_xp))
    except Exception:
        return False


# Step 4: Collect shop home-page links
shop_set = set()
for page_idx in range(shop_lis_page_num):
    # The first list page is already loaded; only later pages need the
    # "next page" link. Harvesting before paging keeps page 1 in the result.
    if page_idx:
        if not _try_click(driver, next_page_xp):
            break  # no further list page
        time.sleep(2)
    try:
        for ele in driver.find_elements(By.XPATH, shop_lis_xp):
            shop_url = ele.get_attribute('href')
            # Skip anchors without an href as well as brand pages.
            if not shop_url or 'brands' in shop_url:
                continue
            shop_set.add(shop_url)
    except Exception as exc:
        print(f"shop list page {page_idx + 1} skipped: {exc!r}")

# Step 5: Visit every shop and store its reviews
# Database connection, created once for the whole run.
# NOTE(review): credentials are embedded in the fallback URL; set
# DIANPING_DB_URL instead and remove the literal once callers have migrated.
conn = create_engine(
    os.environ.get("DIANPING_DB_URL", "mysql+pymysql://root:1711785634@localhost/test")
)
frames = []
for li in shop_set:
    url = li + '/review_all'
    try:
        driver.get(url)
        time.sleep(3)
    except Exception as exc:
        # Without a fresh page the loop below would record the previous
        # shop's reviews under this shop's URL.
        print(f"failed to open {url}: {exc!r}")
        continue

    if _is_forbidden():
        print("被限制")
        break

    for rev_page in range(reviews_page_num):
        current_url = driver.current_url
        shop_title = _text_of(driver, shop_title_xp).strip().split(' ')[0].strip()
        shop_score = _text_of(driver, shop_score_xp).strip()
        shop_reviewcount = _text_of(driver, shop_reviewcount_xp).replace('条评价', '').strip()
        shop_avgpricetitle = (
            _text_of(driver, shop_avgpricetitle_xp).replace('人均：', '').replace('元', '').strip()
        )
        shop_kouwei = _text_of(driver, shop_kouwei_xp).replace('口味：', '').strip()
        shop_huanjing = _text_of(driver, shop_huanjing_xp).replace('环境：', '').strip()
        shop_fuwu = _text_of(driver, shop_fuwu_xp).replace('服务：', '').strip()
        shop_adress = _text_of(driver, shop_adress_xp).replace('地址:', '').strip()
        shop_tel = _text_of(driver, shop_tel_xp).replace('电话:', '').strip()

        for reviews_ele in driver.find_elements(By.XPATH, reviews_block_xp):
            # Expand the review when it has an expand link; absence is fine.
            _try_click(reviews_ele, reviews_zhankai_xp)
            time.sleep(0.5)
            reviews_body = _text_of(reviews_ele, reviews_body_xp).replace('收起评价', '').strip()
            reviews_time = (
                _text_of(reviews_ele, reviews_time_xp).strip().split('更新于')[0].strip()
            )

            df = pd.DataFrame({
                "shop_url": [li],
                "shop_title": [shop_title],
                "shop_score": [shop_score],
                "shop_reviewcount": [shop_reviewcount],
                "shop_avgpricetitle": [shop_avgpricetitle],
                "shop_kouwei": [shop_kouwei],
                "shop_huanjing": [shop_huanjing],
                "shop_fuwu": [shop_fuwu],
                "shop_adress": [shop_adress],
                "shop_tel": [shop_tel],
                "reviews_body": [reviews_body],
                "reviews_time": [reviews_time],
                "current_url": [current_url],
            })
            # Append the row to MySQL right away, one insert per review.
            df.to_sql(con=conn, name='dazdp_data', if_exists='append', index=False)
            # Keep the row for the Excel export; concatenated once at the end.
            frames.append(df)

        # Stop paging when there is no next review page; otherwise the same
        # page would be scraped again and its rows inserted a second time.
        if not _try_click(driver, reviews_nextpage_xp):
            break
        time.sleep(2)
        if _is_forbidden():
            print("被限制")
            break

# Step 6: Export to Excel
df_all = pd.concat(frames) if frames else pd.DataFrame()
df_all.to_excel("大众点评数据.xlsx", index=False)