V2EX = way to explore
V2EX 是一个关于分享和探索的地方
现在注册
已注册用户请  登录
V2EX 提问指南
William55555
V2EX  ›  问与答

使用 selenium+chromedriver 尝试抓取天猫的 cookie,在 Windows 环境下抓取无异常,但部署到 Linux 服务器上时,chromedriver 残留进程一直未被回收,请诸位大神帮忙分析下原因。

  •  
  •   William55555 · 2018-06-11 12:35:45 +08:00 · 2085 次点击
    这是一个创建于 2404 天前的主题,其中的信息可能已经有所发展或是发生改变。

    #coding:utf-8
    import functools
    import logging
    import os
    import Queue
    import random
    import re
    import threading
    import time
    import traceback
    from threading import Thread

    from selenium import webdriver

    # Module-wide lock: guards reads/writes of the cookie file and mutations of
    # the pending-cookie list shared between the background threads.
    lock = threading.Lock()

    def async(f): """异步装饰器""" def wrapper(*args, **kwargs): thr = Thread(target=f, args=args, kwargs=kwargs) thr.start() return wrapper

    class TmallCookie(object): def init(self): # cookie 队列 self.cookie_queue = Queue.Queue() self.cookie_list = list() self.load_cookie() self.parse_cookie() self.save_cookie()

    def load_cookie(self):
        """加载本地已保存的 cookie"""
        lock.acquire()
        with open("tmall_cookie.txt", "r") as f:
            cookie_list = f.readlines()
        lock.release()
        for i in cookie_list:
            self.cookie_queue.put(i.strip())
    
    @async
    def parse_cookie(self):
        """
        请求 cookie,并将 cookie 保存至 cookie 列表
        :return: 
        """
        urls = ['https://detail.tmall.com/item.htm?id=562345301295',
                'https://detail.tmall.com/item.htm?id=553941537843',
                'https://detail.tmall.com/item.htm?id=558646979307',
                'https://list.tmall.com/search_product.htm?spm=a221t.1812074.2005984841.8.44d84208RXceJT&q=%B9%E2%C3'
                '%E6%CE%C4%D0%D8&from=.list.pc_1_searchbutton&acm=lb-zebra-7777-1443323.1003.4.1158540&type=p&scm=100'
                '3.4.lb-zebra-7777-1443323.OTHER_14748278648600_1158540',
                'https://list.tmall.com/spu_detail.htm?fmtab=sp&cat=50105508&spuid=877471268&suid=4e5fd39570486fdf2a'
                '9b3077572be7ab&rn=1e0abfcf6995e918ab6c7bc00d6e9be2'
                ]
        option = webdriver.ChromeOptions()
        option.add_argument('disable-infobars')
        option.add_argument('disable-gpu')
        option.add_argument('--headless')
        option.add_argument("--no-sandbox")
        # option.add_argument("window-size=1024,768")
        while True:
            if self.cookie_queue.qsize() < 20000:
                try:
                    # driver = webdriver.Chrome('C:\\chromedriver.exe', chrome_options=option)
                    driver = webdriver.Chrome('./chromedriver', chrome_options=option)
                    driver.set_page_load_timeout(120)
                    url = random.choice(urls)
                    driver.get(url)
                    time.sleep(5)
                    try:
                        cookies = driver.get_cookies()
                        cookie_string = []
                        for cookie_info in cookies:
                            cookie_string.append(u'%s=%s' % (cookie_info.get(u'name'), cookie_info.get(u'value')))
                        cookie_string = '; '.join(cookie_string)
                        driver.close()
                        driver.quit()
                    except Exception as e:
                        pass
                    try:
                        _tb_token_ = re.findall("(_tb_token_=.*?;)", cookie_string)[0]
                        t = re.findall("(t=[a-z0-9]+)", cookie_string)[0]
                        cna = re.findall("(cna=.*?;)", cookie_string)[0]
                        cookie2 = re.findall("(cookie2=.*?;)", cookie_string)[0].replace(";", "")
                        cookie = _tb_token_ + " " + t + "; " + cna + " " + cookie2
                        try:
                            enc = re.findall("(enc=.*?;)", cookie_string)[0]
                            cookie = _tb_token_ + " " + t + "; " + cna + " " + enc + " " + cookie2
                        except:
                            pass
                        print cookie
                        self.cookie_queue.put(cookie)
                        lock.acquire()
                        self.cookie_list.append(cookie)
                        lock.release()
                    except Exception as e:
                        pass
                except Exception as e:
                    print traceback.format_exc()
            else:
                time.sleep(300)
    
    @async
    def save_cookie(self):
        """
        清空之前的 cookie 文件,将当天抓取的 cookie 保存至文件
        :return: 
        """
        while True:
            time.sleep(1)
            if len(self.cookie_list) > 10:
                lock.acquire()
                # with open("tmall_cookie.txt", "w") as f1:
                #     f1.truncate()
                #     time.sleep(5)
                with open("tmall_cookie.txt", "a") as f2:
                    for cookie in self.cookie_list:
                        f2.write(cookie)
                        f2.write("\n")
                    self.cookie_list = []
                lock.release()
    
    def get_cookie(self):
        """
        获取一个 cookie
        :return: tmall cookie
        """
        while True:
            try:
                cookie = self.cookie_queue.get(timeout=5)
                break
            except Exception as e:
                logging.warning("Get cookie error: %s" % e)
                time.sleep(5)
        if self.cookie_queue.qsize() <= 5000:
            self.cookie_queue.put(cookie)
        return cookie
    

    if __name__ == '__main__':
        # Constructing the object loads the saved cookies and starts the
        # background harvesting and saving threads.
        cookie = TmallCookie()
        # Example consumer loop:
        # while True:
        #     print cookie.get_cookie()
        #     time.sleep(2)

    2 条回复    2018-07-07 17:31:43 +08:00
    a7a2
        1
    a7a2  
       2018-06-11 13:31:04 +08:00
    按照经验,这不是你代码的问题,而是你用的 webdriver 库跟 chromedriver 之间的问题,在 macOS 下也有同样的问题。
    可以尝试调用 kill 之类的命令结束它,也就是自己管理进程。
    1109599636
        2
    1109599636  
       2018-07-07 17:31:43 +08:00
    我以前写的时候是换的火狐的驱动....
    关于   ·   帮助文档   ·   博客   ·   API   ·   FAQ   ·   实用小工具   ·   2970 人在线   最高记录 6679   ·     Select Language
    创意工作者们的社区
    World is powered by solitude
    VERSION: 3.9.8.5 · 22ms · UTC 14:44 · PVG 22:44 · LAX 06:44 · JFK 09:44
    Developed with CodeLauncher
    ♥ Do have faith in what you're doing.