import asyncio
import aiohttp
import urllib.parse
from bs4 import BeautifulSoup
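
# A breadth-first crawler for es.wikipedia.org (requires aiohttp and
# beautifulsoup4): worker tasks fetch pages and push the article links they
# find onto parsed_links_queue; a single consumer task deduplicates those
# links, writes them to out.txt and feeds them back into main_queue so they
# get crawled in turn.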
main_queue = asyncio.Queue()          # URLs waiting to be fetched
parsed_links_queue = asyncio.Queue()  # links extracted from fetched pages
parsed_links = set()                  # every URL seen so far (deduplication)
session = None                        # aiohttp session, created in main()
f_out = None                          # output file handle, opened in main()
visited_urls = 0                      # number of pages fetched so far

async def get_url(url):
    """Fetch a page and return its HTML, or None if the request fails."""
    global visited_urls
    try:
        async with session.get(url) as resp:
            visited_urls += 1
            return await resp.text()
    except Exception:
        print(f'Bad URL: {url}')
        return None

async def worker():
    while True:
        url = await main_queue.get()
        html = await get_url(url)
        if html is not None:
            soup = BeautifulSoup(html, 'html.parser')
            # Keep only internal article links; skip namespaced pages such as
            # 'Especial:' or 'Archivo:' (their hrefs contain a colon).
            for a in soup.select('a[href]'):
                href = a['href']
                if href.startswith('/wiki/') and ':' not in href:
                    parsed_links_queue.put_nowait('https://es.wikipedia.org' + href)
        main_queue.task_done()

async def consumer():
    while True:
        url = await parsed_links_queue.get()
        if url not in parsed_links:
            # Record every newly discovered URL in out.txt and queue it for crawling.
            print(urllib.parse.unquote(url), file=f_out, flush=True)
            parsed_links.add(url)
            main_queue.put_nowait(url)
        parsed_links_queue.task_done()

async def main():
    global session, f_out
    seed_url = 'https://es.wikipedia.org/wiki/Olula_del_R%C3%ADo'
    parsed_links.add(seed_url)

    with open('out.txt', 'w') as f_out:
        async with aiohttp.ClientSession() as session:
            # 16 concurrent workers fetch pages; one consumer deduplicates the links.
            workers = {asyncio.create_task(worker()) for _ in range(16)}
            c = asyncio.create_task(consumer())

            main_queue.put_nowait(seed_url)
            print('Initializing...')
            await asyncio.sleep(5)

            # Report progress while there is still work queued.
            while main_queue.qsize():
                print(f'Visited URLs: {visited_urls:>7} Known URLs (saved in out.txt): {len(parsed_links):>7}', flush=True)
                await asyncio.sleep(0.1)

            await main_queue.join()
            await parsed_links_queue.join()

            # Stop the background tasks before the session and file close.
            for w in workers:
                w.cancel()
            c.cancel()

asyncio.run(main())