The Python Oracle

Python Read huge file line per line and send it to multiprocessing or thread

--------------------------------------------------
Rise to the top 3% as a developer or hire one of them at Toptal: https://topt.al/25cXVn
--------------------------------------------------

Music by Eric Matyas
https://www.soundimage.org
Track title: Lost Civilization

--

Chapters
00:00 Python Read Huge File Line Per Line And Send It To Multiprocessing Or Thread
01:24 Accepted Answer Score 4
02:43 Answer 2 Score 1
04:47 Thank you

--

Full question
https://stackoverflow.com/questions/7182...

--

Content licensed under CC BY-SA
https://meta.stackexchange.com/help/lice...

--

Tags
#python #multiprocessing #readfile

#avk47



ACCEPTED ANSWER

Score 4


The problem itself seems unrealistic, though: shooting 737,022,387 requests! Work out how many months that would take from a single computer.

Still, a better way to do this task is to read the file line by line in a separate thread and insert the lines into a queue, then have multiple processes consume the queue.

Solution 1:

from multiprocessing import Queue, Process
from threading import Thread
from time import sleep

urls_queue = Queue()  # module-level queue; child processes inherit it when started with the fork method
max_process = 4

def read_urls():
    with open('urls_file.txt', 'r') as f:
        for url in f:
            urls_queue.put(url.strip())
            print('put url: {}'.format(url.strip()))

    # put DONE to tell send_request_processor to exit
    for i in range(max_process):
        urls_queue.put("DONE")


def send_request(url):
    print('send request: {}'.format(url))
    sleep(1)
    print('recv response: {}'.format(url))


def send_request_processor():
    print('start send request processor')
    while True:
        url = urls_queue.get()
        if url == "DONE":
            break
        else:
            send_request(url)


def main():
    file_reader_thread = Thread(target=read_urls)
    file_reader_thread.start()

    procs = []
    for i in range(max_process):
        p = Process(target=send_request_processor)
        procs.append(p)
        p.start()

    for p in procs:
        p.join()

    print('all done')
    # make sure the reader thread has finished
    file_reader_thread.join()


if __name__ == '__main__':
    main()

Demo: https://onlinegdb.com/Elfo5bGFz
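
The send_request above only sleeps to simulate network latency. A minimal sketch of a real worker, assuming the third-party requests library and a 10-second timeout (both my choices, not part of the original answer):

import requests

def send_request(url):
    print('send request: {}'.format(url))
    try:
        # blocks until the response arrives or the timeout expires
        resp = requests.get(url, timeout=10)
        print('recv response: {} -> {}'.format(url, resp.status_code))
    except requests.RequestException as exc:
        print('request failed: {} ({})'.format(url, exc))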

Solution 2:

You can use the tornado asynchronous networking library:

from tornado import gen
from tornado.ioloop import IOLoop
from tornado.queues import Queue

q = Queue(maxsize=2)

async def consumer():
    async for item in q:
        try:
            print('Doing work on %s' % item)
            await gen.sleep(0.01)
        finally:
            q.task_done()

async def producer():
    with open('urls_file.txt', 'r') as f:
        for url in f:
            await q.put(url)
            print('Put %s' % url)

async def main():
    # Start consumer without waiting (since it never finishes).
    IOLoop.current().spawn_callback(consumer)
    await producer()     # Wait for producer to put all tasks.
    await q.join()       # Wait for consumer to finish all tasks.
    print('Done')
    # producer and consumer can run in parallel

IOLoop.current().run_sync(main)
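
Note that the snippet above runs a single consumer coroutine, so items are processed one at a time. To overlap the work you could spawn several consumers; a quick sketch (the count of 4 is an arbitrary choice, not from the original answer):

async def main():
    # start several consumers so items are handled concurrently
    for _ in range(4):
        IOLoop.current().spawn_callback(consumer)
    await producer()     # Wait for producer to put all tasks.
    await q.join()       # Wait for consumers to finish all tasks.
    print('Done')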



ANSWER 2

Score 1


Using multiprocessing.pool.Pool.imap is a step in the right direction, but the problem is that with so much input you will be feeding the input task queue faster than the processing pool can take tasks off the queue and return results. Consequently, the task queue will continue to grow and you will exhaust memory.

What is needed is a way to "throttle" imap so that it blocks once the task queue holds N tasks. I think a reasonable default for N is twice the pool size, which ensures that when a pool process completes a task there is no delay before it finds another one to work on. Hence we create the classes BoundedQueueProcessPool (multiprocessing) and BoundedQueueThreadPool (multithreading):

import multiprocessing.pool
import multiprocessing
import threading


class ImapResult():
    # Wraps the pool's result iterator and releases one semaphore slot
    # for every result that is consumed from it.
    def __init__(self, semaphore, result):
        self._semaphore = semaphore
        self.it = result.__iter__()

    def __iter__(self):
        return self

    def __next__(self):
        try:
            elem = self.it.__next__()
            self._semaphore.release()
            return elem
        except StopIteration:
            raise
        except:
            # the task raised an exception but still occupied a slot
            self._semaphore.release()
            raise

class BoundedQueuePool:
    def __init__(self, limit, semaphore):
        self._limit = limit
        self._semaphore = semaphore

    def release(self, result, callback=None):
        self._semaphore.release()
        if callback:
            callback(result)

    def apply_async(self, func, args=(), kwds={}, callback=None, error_callback=None):
        self._semaphore.acquire()
        callback_fn = self.release if callback is None else lambda result: self.release(result, callback=callback)
        error_callback_fn = self.release if error_callback is None else lambda result: self.release(result, callback=error_callback)
        return super().apply_async(func, args, kwds, callback=callback_fn, error_callback=error_callback_fn)

    def imap(self, func, iterable, chunksize=1):
        def new_iterable(iterable):
            for elem in iterable:
                self._semaphore.acquire()
                yield elem
        if chunksize > self._limit:
            raise ValueError(f'chunksize argument exceeds {self._limit}')
        result = super().imap(func, new_iterable(iterable), chunksize)
        return ImapResult(self._semaphore, result)

    def imap_unordered(self, func, iterable, chunksize=1):
        def new_iterable(iterable):
            for elem in iterable:
                self._semaphore.acquire()
                yield elem
        if chunksize > self._limit:
            raise ValueError(f'chunksize argument exceeds {self._limit}')
        result = super().imap_unordered(func, new_iterable(iterable), chunksize)
        return ImapResult(self._semaphore, result)

class BoundedQueueProcessPool(BoundedQueuePool, multiprocessing.pool.Pool):
    def __init__(self, *args, max_waiting_tasks=None, **kwargs):
        multiprocessing.pool.Pool.__init__(self, *args, **kwargs)
        if max_waiting_tasks is None:
            max_waiting_tasks = self._processes
        elif max_waiting_tasks < 0:
            raise ValueError(f'Invalid negative max_waiting_tasks value: {max_waiting_tasks}')
        limit = self._processes + max_waiting_tasks
        BoundedQueuePool.__init__(self, limit, multiprocessing.BoundedSemaphore(limit))

class BoundedQueueThreadPool(BoundedQueuePool, multiprocessing.pool.ThreadPool):
    def __init__(self, *args, max_waiting_tasks=None, **kwargs):
        multiprocessing.pool.ThreadPool.__init__(self, *args, **kwargs)
        if max_waiting_tasks is None:
            max_waiting_tasks = self._processes
        elif max_waiting_tasks < 0:
            raise ValueError(f'Invalid negative max_waiting_tasks value: {max_waiting_tasks}')
        limit = self._processes + max_waiting_tasks
        BoundedQueuePool.__init__(self, limit, threading.BoundedSemaphore(limit))


#######################################################################

from time import sleep


def process_line(line):
    sleep(3)
    # the lines already have line end characters:
    print(line, end='')
    return True

if __name__ == "__main__":
    pool = BoundedQueueProcessPool(2)
    with open("test.txt") as file:
        # iterate the results; each consumed result releases a semaphore slot
        for res in pool.imap(process_line, file):
            #print(res)
            pass
    pool.close()
    pool.join()
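
BoundedQueueThreadPool is defined above but never exercised in the demo. A usage sketch of my own (not part of the answer), since the thread-pool variant is the natural fit for I/O-bound work such as sending requests:

if __name__ == "__main__":
    pool = BoundedQueueThreadPool(8)                  # 8 worker threads, an arbitrary choice
    with open("test.txt") as file:
        for res in pool.imap_unordered(process_line, file):
            pass                                      # consume results to free semaphore slots
    pool.close()
    pool.join()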