Folks, I need help: I've been stuck on this for quite a while now and can't get it off the ground.
I'm writing a small tool that crawls a list of URLs I'm interested in, using multiprocessing. Ideally I want to maximize the number of pages loaded per second.
The gist is this (code below):
1. I fill a queue with tasks (Task objects).
2. Each page is checked against various conditions.
3. The result is saved to local disk as a .txt file.
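The Task class itself isn't in the snippet below; it's essentially just a container. A minimal sketch, reconstructed from the attributes the Worker reads (stringId, stringUrl, session), so treat the exact shape as an assumption:

```python
class Task:
    """One unit of work: an id used as the output file name,
    the URL to fetch, and a shared requests session."""
    def __init__(self, stringId, stringUrl, session):
        self.stringId = stringId
        self.stringUrl = stringUrl
        self.session = session
```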
It crashes with this error (I'm on macOS):

```
objc[39553]: +[__NSPlaceholderDate initialize] may have been in progress in another thread when fork() was called.
objc[39553]: +[__NSPlaceholderDate initialize] may have been in progress in another thread when fork() was called. We cannot safely call it or ignore it in the fork() child process. Crashing instead. Set a breakpoint on objc_initializeAfterForkError to debug.
```
Questions:
1. Can I somehow apply multiprocessing.set_start_method('spawn') given that my Worker subclasses multiprocessing.Process?
2. Would it help?
3. If not, what direction should I be googling in?
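To make question 1 concrete, this is the kind of thing I mean; a minimal standalone sketch, not my actual code:

```python
import multiprocessing

def work(n):
    # placeholder for the real per-task work
    return n * n

if __name__ == "__main__":
    # 'spawn' starts each child as a fresh interpreter instead of fork()ing
    # the parent, which is what the objc fork-safety check objects to.
    # set_start_method may only be called once, and only under this guard.
    multiprocessing.set_start_method('spawn')
    with multiprocessing.Pool(2) as pool:
        print(pool.map(work, range(4)))
```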
P.S. I'd be grateful for any reading material on this kind of task, and for links to similar problems.
```python
import multiprocessing
import re
import time
from multiprocessing import JoinableQueue, Queue

from bs4 import BeautifulSoup
from tqdm import tqdm

# Task, getSession, resultWriter, filePath and pathToFolder are defined
# elsewhere in the project.


class Worker(multiprocessing.Process):
    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self, daemon=True)
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        proc_name = self.name
        massageToError = ''
        while not self.task_queue.empty():
            next_task = self.task_queue.get()
            if next_task is None:  # poison pill: stop this worker
                self.task_queue.task_done()
                self.result_queue.put(proc_name + ' task - none!')
                globalBar.update(1)
                break
            strId = next_task.stringId
            strUrl = next_task.stringUrl
            session = next_task.session
            should_save = True
            try:
                if self.checkUrl(strUrl):
                    req = session.get(strUrl, verify=False, timeout=(3, 15))
                    if req.status_code == 200:
                        local_encod = req.encoding
                        soup = BeautifulSoup(req.content, 'html.parser',
                                             from_encoding=local_encod)
                        text = self.handler(soup)
                        if should_save:
                            try:
                                with open(pathToFolder + '/' + str(strId) + '.txt',
                                          'w', encoding='utf-8') as writer:
                                    writer.write(text)
                            except Exception as exeptInDump:
                                strToWrite = 'DumpFileFync: ' + strId + '\t' + str(exeptInDump)
                                self.errorWriter(strToWrite)
            except Exception as exceptInWorker:
                massageToError = 'WorkeDefError: ' + strId + '\t' + str(exceptInWorker)
                self.errorWriter(massageToError)
            finally:
                self.task_queue.task_done()
                if len(massageToError) > 0:
                    self.result_queue.put(massageToError)
                else:
                    self.result_queue.put(str(proc_name) + ' - done')
                globalBar.update(1)
        return

    def errorWriter(self, strMassage):
        # error-log file path is set here
        path_to_error = '/Users/uhntiss/WORK/Python_Project/loader_v1/errors.txt'
        # lock.acquire()
        try:
            with open(path_to_error, encoding='utf-8', mode='a') as writer:
                writer.write(strMassage + '\n')
        except Exception:
            print('error in error writer!!!!!')
        finally:
            # lock.release()
            pass

    def checkUrl(self, stringUrl):
        # returns True if the URL contains one of these substrings
        matchList = ['.zip', '.pdf', '.doc', '.img', 'jpg', 'javascript:;',
                     '.gzip', '.gzp', '.tif', '.docx', '.rtf', '.txt',
                     '.wks', '.wps', '.wpd', '.odt', '.gif', '.tiff']
        for el in matchList:
            if el in stringUrl:
                return True
        return False

    def handler(self, soup):
        # strip noise tags, then collapse repeated spaces in the visible text
        delete_list = ['style', 'script', 'img', 'input']
        for elem in soup.find_all(delete_list):
            elem.decompose()
        return re.sub(' +', ' ', soup.get_text(' ', strip=True))


if __name__ == "__main__":
    input(' - Press any key to continue.')
    try:
        # NOTE: this only creates a context object and discards it;
        # it does not change the start method for the workers below.
        multiprocessing.get_context('spawn')
    except RuntimeError:
        pass
    print('--------------start---------------------')
    # establish communication queues
    tasks = JoinableQueue()
    results = Queue()
    # create Session object
    sessionToWorker = getSession()
    # build the dict of id -> url from a tab-separated file
    loadList = {}
    try:
        with open(filePath, 'r', encoding='utf-8') as reader:
            for line in reader:
                key = re.sub('\n', '', line.split('\t')[2])
                strUrl = re.sub('\n', '', line.split('\t')[1])
                loadList[key] = strUrl
    except Exception as identifier:
        print('Something went wrong with the file:')
        print('------ ERROR ------')
        print(str(identifier))
        quit()
    # create tasks
    for key in tqdm(loadList):
        strUrlToTask = str(loadList.get(key))
        fileNameToTask = str(key)
        tasks.put(Task(fileNameToTask, strUrlToTask, sessionToWorker))
        del strUrlToTask, fileNameToTask
    print('----------------------------------------')
    time.sleep(1)
    globalBar = tqdm(total=len(loadList))
    # init workers
    num_worker = multiprocessing.cpu_count() - 1  # cpu_count * 2!
    workers = [Worker(tasks, results) for i in range(num_worker)]
    for worker in workers:
        worker.start()
    time.sleep(1)
    tasks.join()
    globalBar.close()
    num_jobs = int(results.qsize())
    while num_jobs:
        result = results.get()
        resultWriter(str(result))
        num_jobs -= 1
    del sessionToWorker, loadList
    quit()
```
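And in case a global set_start_method turns out not to be the right tool, here's the other variant I'm considering: passing a spawn context through explicitly, since get_context('spawn') returns spawn-flavoured Process and queue classes. A minimal standalone sketch (untested; SpawnWorker and the placeholder work are mine, only the queue wiring mirrors my code):

```python
import multiprocessing

# A spawn context provides spawn-flavoured Process/Queue/JoinableQueue
# classes without changing the global start method.
ctx = multiprocessing.get_context('spawn')


class SpawnWorker(ctx.Process):
    def __init__(self, task_queue, result_queue):
        super().__init__(daemon=True)
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        while True:
            task = self.task_queue.get()
            if task is None:  # poison pill ends the worker
                self.task_queue.task_done()
                break
            self.result_queue.put(task.upper())  # placeholder "work"
            self.task_queue.task_done()


if __name__ == "__main__":
    tasks = ctx.JoinableQueue()
    results = ctx.Queue()
    for t in ['a', 'b']:
        tasks.put(t)
    tasks.put(None)
    worker = SpawnWorker(tasks, results)
    worker.start()
    tasks.join()
    for _ in range(2):
        print(results.get())
```

One thing I'd have to deal with either way: with spawn, the children re-import the main module instead of inheriting its state, so module-level objects like globalBar and sessionToWorker wouldn't be shared the way they are under fork.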