The real way to check the forum

Message

#1 Post by **Nite Coder** » Mon Aug 05, 2019 7:35 pm

So I decided that it wasn't the Linux way to open a web browser and go to the MX Linux forum just to see the newest posts. I thought it would be much better in the terminal. So I wrote a Python script to get the results for me!

Code: Select all

#!/usr/bin/python3

from bs4 import BeautifulSoup
import requests
import sys
import os
import time

def get_data(timeout=30):
    """Fetch the MX Linux forum index and return the topic titles found on it.

    Args:
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` may block indefinitely, which would stall the
            ``-b`` notification loop for good.

    Returns:
        list: Stripped text of every ``a.topictitle`` link nested inside
        ``ul.topiclist`` / ``li.row`` / ``div.list-inner`` / ``li``, in
        document order.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get('https://forum.mxlinux.org', timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page and
    # silently reporting "no topics".
    response.raise_for_status()
    parser = BeautifulSoup(response.text, 'html.parser')

    titles = []
    for topiclist in parser.find_all('ul', {'class': 'topiclist'}):
        for row in topiclist.find_all('li', {'class': 'row'}):
            for inner in row.find_all('div', {'class': 'list-inner'}):
                for item in inner.find_all('li'):
                    for anchor in item.find_all('a', {'class': 'topictitle'}):
                        titles.append(anchor.text.strip())
    return titles

def main(argv):
    """Print the current forum topics, or with ``-b`` show them as notifications.

    Args:
        argv: Argument list in ``sys.argv`` form. When ``argv[1]`` is
            ``'-b'`` the function loops forever, sending the topic list to
            ``notify-send`` and then sleeping 255 seconds. With any other
            argument, or none, the topics are printed once, one per line.

    Raises:
        FileNotFoundError: In ``-b`` mode when ``notify-send`` is not
            installed.
    """
    # Local import: only the notification branch needs it.
    import subprocess

    if len(argv) > 1 and argv[1] == '-b':
        while True:
            message = ''.join(title + '\n' for title in get_data())
            # Topic titles are written by arbitrary forum users. Splicing
            # them into a shell command line lets a crafted title run
            # commands (and any title containing an apostrophe breaks the
            # quoting), so hand the text over as a single argv entry.
            subprocess.run(['notify-send', message])
            time.sleep(255)
    else:
        for title in get_data():
            print(title)

try:
    main(sys.argv)
except KeyboardInterrupt:
    # Ctrl-C is the normal way to stop the ``-b`` loop; leave quietly
    # instead of reporting it as an error.
    sys.exit(130)
except Exception as exc:
    # Catch only real errors (a bare ``except`` also traps SystemExit and
    # KeyboardInterrupt), say what went wrong, and exit non-zero so that
    # callers such as shell scripts or conky can detect the failure.
    print('Error thrown... Exiting...', file=sys.stderr)
    print('{}: {}'.format(type(exc).__name__, exc), file=sys.stderr)
    sys.exit(1)

It does require Beautiful Soup.

Code: Select all

sudo apt install python3-bs4

Coming soon... Using lynx to post on the MX Forum.

#2 Post by **JayM** » Mon Aug 05, 2019 8:00 pm

Don't forget Unanswered Topics.

#3 Post by **Adrian** » Mon Aug 05, 2019 8:09 pm

This is nice! I would like to be able to get results with my account, especially the "unread posts" search (search.php?search_id=unreadposts).

#4 Post by **Nite Coder** » Thu Aug 08, 2019 10:43 am

So I made some changes, and now you can choose which page you want to scrape and which section you want to get. Doing it as your own user is having some issues at the moment: once I log in, I get the wrong page. But you can scrape unanswered topics, and I'm not giving up on the user login thing.

If you don't want to copy it from below, it is at https://github.com/TheNiteCoder/mx-forum-scraper, and the file you want is request.py.

Code: Select all

#!/usr/bin/python3

import requests
from bs4 import BeautifulSoup
import urllib.parse as urlparse
from urllib.parse import urlencode

forum = 'https://forum.mxlinux.org/'

def get_html(url='', password=None, username=None, timeout=30):
    """Return the HTML of ``url``, logging in to the forum first if asked to.

    Args:
        url: Address of the page to download.
        password: Forum account password, or None for anonymous access.
        username: Forum account name, or None for anonymous access.
        timeout: Seconds to wait for each HTTP request; without it a stalled
            connection would block forever.

    Returns:
        str: The response body as text.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    if password is None or username is None:
        return requests.get(url, timeout=timeout).text

    headers = {'User-Agent': 'Mozilla/5.0'}
    payload = {'username': username, 'password': password,
               'redirect': 'index.php', 'login': 'Login'}
    # The context manager closes the session's pooled connections when done.
    with requests.Session() as session:
        login = session.post(forum + "ucp.php?mode=login", headers=headers,
                             data=payload, timeout=timeout)

        parameters = {'mode': 'login'}
        # The session id is read from the first "sid" occurrence in the login
        # response: one separator character, then 32 characters of id.
        sid_start = login.text.find("sid")
        if sid_start != -1:
            # find() returns -1 when nothing matches; slicing from there
            # would send garbage as the sid, so only add it when found.
            parameters['sid'] = login.text[sid_start + 4:sid_start + 4 + 32]

        # Fetch the page with GET. POSTing to a listing URL is what returned
        # the wrong page after logging in.
        page = session.get(url, headers=headers, params=parameters,
                           timeout=timeout)
        return page.text

def get_url_arg(url, arg):
    """Return the first value of query parameter ``arg`` in ``url``.

    Raises:
        KeyError: If ``url`` has no (non-blank) ``arg`` parameter.
    """
    query_string = urlparse.urlparse(url).query
    values_by_name = urlparse.parse_qs(query_string)
    first_value = values_by_name[arg][0]
    return first_value

def set_url_arg(url, arg, val):
    """Return ``url`` with query parameter ``arg`` set to ``str(val)``.

    An existing ``arg`` keeps its position in the query string; a new one is
    appended. The rest of the URL is left as parsed.
    """
    parsed = urlparse.urlparse(url)
    # Going through a dict keeps one value per parameter name.
    query = dict(urlparse.parse_qsl(parsed.query))
    query[arg] = str(val)
    rebuilt = parsed._replace(query=urlencode(query))
    return urlparse.urlunparse(rebuilt)

def get_only_pagination_number_buttons(tag):
    """Tell whether ``tag`` wraps a numbered pagination link.

    True only when the tag's first ``<a>`` has ``button`` in its ``class``
    and in its ``role``, has an ``href`` other than ``#``, and the tag itself
    carries no ``class`` attribute.
    """
    link = tag.a
    if link is None:
        return False
    return (
        link.has_attr('class') and 'button' in link['class']
        and link.has_attr('role') and 'button' in link['role']
        and link.has_attr('href') and link['href'] != '#'
        and not tag.has_attr('class')
    )

# TODO improve merger
class Merge:
    """Combine several HTML documents into one string, stored in ``self.merged``.

    ``self.merged`` is the empty string when no documents are given.
    """
    def __init__(self, htmls=[]):
        """Build ``self.merged`` from the HTML strings in ``htmls``."""
        # The default list is only read here, never mutated.
        if len(htmls) < 1:
            self.merged = ''
            return None
        # Concatenate the documents back to back and parse them as one.
        text = ''.join(html for html in htmls)
        soup = BeautifulSoup(text, 'html.parser')
        # Serialise every top-level node of the combined parse tree.
        text2 = ''.join(str(tag) for tag in list(soup.children))
        # Use the first top-level node as the container for everything.
        main_html = list(soup.children)[0]
        # NOTE(review): assigning markup through ``.string`` presumably stores
        # it as text rather than as parsed tags, so ``str(soup)`` may come out
        # HTML-escaped -- confirm against the Beautiful Soup documentation.
        main_html.string = text2
        # Detach every other top-level node so only the container remains.
        # NOTE(review): nodes are extracted while ``soup.children`` is being
        # iterated; this may skip some siblings -- TODO confirm.
        for tag in soup.children:
            if tag is not main_html:
                tag.extract()
        self.merged = str(soup)

class Request:
    """Download a forum page and, when it is paginated, further pages as well.

    Attributes:
        url: The address that was requested.
        text: HTML of the single page, or of all fetched pages merged by
            ``Merge``.
        html: Always the empty string; kept so existing readers of the
            attribute keep working.
    """

    def __init__(self, url='', pages=1, username=None, password=None):
        """Fetch ``url`` and store the resulting HTML in ``self.text``.

        Args:
            url: Page to download.
            pages: Number of pages to fetch when the page shows pagination.
            username: Forum account name, or None for anonymous access.
            password: Forum account password, or None for anonymous access.
        """
        self.html = ''
        self.url = url
        # No debug dump of the page here: printing the whole HTML document
        # would bury the topic list the script is meant to output.
        first_page = get_html(url=self.url, password=password, username=username)
        parser = BeautifulSoup(first_page, 'html.parser')
        if parser.find('div', {'class': 'pagination'}) is None:
            # Not paginated: the page just fetched is the whole result.
            self.text = first_page
            return

        # The highest page number is deliberately not scanned for: it was
        # never used, and converting link text to int could raise on
        # unexpected markup.

        # The ``start`` offset advances by 10 per page for topic views
        # (``viewtopic`` URLs) and by 20 per page for everything else.
        step = 10 if 'viewtopic' in url else 20
        htmls = []
        start = 0
        while len(htmls) < pages:
            page_url = set_url_arg(self.url, 'start', start)
            htmls.append(get_html(url=page_url, password=password, username=username))
            start += step
        self.text = Merge(htmls=htmls).merged

def get_all_forumbg(tag):
    """Return True when ``tag`` has ``forumbg`` or ``forabg`` among its classes.

    Used as a filter callable for ``find_all``. Always returns a real bool;
    a tag with other classes yields False rather than an implicit None.
    """
    if not tag.has_attr('class'):
        return False
    classes = tag['class']
    return 'forumbg' in classes or 'forabg' in classes

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('url', help='Url for forum page')
parser.add_argument('--password', help='Password for your forum account')
parser.add_argument('--username', help='Username for your forum account')
parser.add_argument('--section', help='Name of section')
# type=int makes argparse reject a non-numeric amount with a usage message
# instead of a ValueError traceback later on.
parser.add_argument('--amount', type=int, default=20,
                    help='Amount of topics your want, default is 20')
args = parser.parse_args()

# The page count assumes 20 topics per listing page. Round up so that
# --amount 30 fetches two pages, and always fetch at least one page: plain
# truncation gave 0 pages (and no output on paginated listings) for any
# amount below 20.
pages = max(1, -(-args.amount // 20))

request = Request(url=args.url, pages=pages, username=args.username, password=args.password)
soup = BeautifulSoup(request.text, 'html.parser')

# Maps a section heading to the list of titles found under it.
section_map = {}

current_section = ''

for forumbg in soup.find_all(get_all_forumbg):
    # Search inside this block only. Scanning the whole document here would
    # collect every title once per forumbg/forabg block on the page.
    for topiclist in forumbg.find_all('ul', {'class' : 'topiclist'}):
        if 'forums' in topiclist['class']:
            for topictitle in topiclist.find_all('a', {'class' : 'topictitle'}):
                # setdefault: titles may appear before any usable header.
                section_map.setdefault(current_section, []).append(topictitle.contents[2].strip())
        elif 'topics' in topiclist['class']:
            for topictitle in topiclist.find_all('a', {'class' : 'topictitle'}):
                section_map.setdefault(current_section, []).append(topictitle.string.strip())
        else:
            # Header list: its first list-inner cell names the section.
            header_cells = topiclist.find_all('div', {'class':'list-inner'})
            section_name = header_cells[0].string if header_cells else None
            if section_name is None:
                # No usable heading: keep filing titles under the previous
                # section instead of switching to a None key.
                continue
            current_section = section_name
            section_map.setdefault(current_section, [])

count = 0

if args.section is None:
    # No section requested: take the titles of every section in order.
    selected = [title for titles in section_map.values() for title in titles]
elif args.section in section_map:
    selected = section_map[args.section]
else:
    selected = []
    print('Invalid section')

# A negative amount prints nothing, as before.
for item in selected[:max(args.amount, 0)]:
    print(item)
    count += 1

#5 Post by **Nite Coder** » Thu Aug 08, 2019 11:38 am

A complete version! You can now get results with your account! It supports returning a certain number of topics and choosing which section to get them from.

Code: Select all

#!/usr/bin/python3

import requests
from bs4 import BeautifulSoup
import urllib.parse as urlparse
from urllib.parse import urlencode

forum = 'https://forum.mxlinux.org/'

def get_html(url='', password=None, username=None, timeout=30):
    """Return the HTML of ``url``, logging in to the forum first if asked to.

    Args:
        url: Address of the page to download.
        password: Forum account password, or None for anonymous access.
        username: Forum account name, or None for anonymous access.
        timeout: Seconds to wait for each HTTP request; without it a stalled
            connection would block forever.

    Returns:
        str: The response body as text.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    if password is None or username is None:
        return requests.get(url, timeout=timeout).text

    headers = {'User-Agent': 'Mozilla/5.0'}
    payload = {'username': username, 'password': password,
               'redirect': 'index.php', 'login': 'Login'}
    # The context manager closes the session's pooled connections when done.
    with requests.Session() as session:
        login = session.post(forum + "ucp.php?mode=login", headers=headers,
                             data=payload, timeout=timeout)

        parameters = {'mode': 'login'}
        # The session id is read from the first "sid" occurrence in the login
        # response: one separator character, then 32 characters of id.
        sid_start = login.text.find("sid")
        if sid_start != -1:
            # find() returns -1 when nothing matches; slicing from there
            # would send garbage as the sid, so only add it when found.
            parameters['sid'] = login.text[sid_start + 4:sid_start + 4 + 32]

        res = session.get(url, headers=headers, params=parameters,
                          timeout=timeout)
        return res.text

def get_url_arg(url, arg):
    """Look up query parameter ``arg`` in ``url`` and return its first value.

    Raises:
        KeyError: If the parameter is absent (or only present with a blank
            value).
    """
    query_values = urlparse.parse_qs(urlparse.urlparse(url).query)
    matches = query_values[arg]
    return matches[0]

def set_url_arg(url, arg, val):
    """Return a copy of ``url`` whose query parameter ``arg`` equals ``str(val)``.

    A parameter that already exists is overwritten in place; otherwise it is
    added at the end of the query string.
    """
    scheme, netloc, path, params, query_string, fragment = urlparse.urlparse(url)
    # A dict holds a single value per parameter name.
    pairs = dict(urlparse.parse_qsl(query_string))
    pairs.update({arg: str(val)})
    return urlparse.urlunparse(
        [scheme, netloc, path, params, urlencode(pairs), fragment]
    )

def get_only_pagination_number_buttons(tag):
    """Tell whether ``tag`` wraps a numbered pagination link.

    True only when the tag's first ``<a>`` has ``button`` in its ``class``
    and in its ``role``, has an ``href`` other than ``#``, and the tag itself
    carries no ``class`` attribute.
    """
    link = tag.a
    if link is None:
        return False
    return (
        link.has_attr('class') and 'button' in link['class']
        and link.has_attr('role') and 'button' in link['role']
        and link.has_attr('href') and link['href'] != '#'
        and not tag.has_attr('class')
    )

# TODO improve merger
class Merge:
    """Combine several HTML documents into one string, stored in ``self.merged``.

    ``self.merged`` is the empty string when no documents are given.
    """
    def __init__(self, htmls=[]):
        """Build ``self.merged`` from the HTML strings in ``htmls``."""
        # The default list is only read here, never mutated.
        if len(htmls) < 1:
            self.merged = ''
            return None
        # Concatenate the documents back to back and parse them as one.
        text = ''.join(html for html in htmls)
        soup = BeautifulSoup(text, 'html.parser')
        # Serialise every top-level node of the combined parse tree.
        text2 = ''.join(str(tag) for tag in list(soup.children))
        # Use the first top-level node as the container for everything.
        main_html = list(soup.children)[0]
        # NOTE(review): assigning markup through ``.string`` presumably stores
        # it as text rather than as parsed tags, so ``str(soup)`` may come out
        # HTML-escaped -- confirm against the Beautiful Soup documentation.
        main_html.string = text2
        # Detach every other top-level node so only the container remains.
        # NOTE(review): nodes are extracted while ``soup.children`` is being
        # iterated; this may skip some siblings -- TODO confirm.
        for tag in soup.children:
            if tag is not main_html:
                tag.extract()
        self.merged = str(soup)

class Request:
    """Download a forum page and, when it is paginated, further pages as well.

    Attributes:
        url: The address that was requested.
        text: HTML of the single page, or of all fetched pages merged by
            ``Merge``.
        html: Always the empty string; kept so existing readers of the
            attribute keep working.
    """

    def __init__(self, url='', pages=1, username=None, password=None):
        """Fetch ``url`` and store the resulting HTML in ``self.text``.

        Args:
            url: Page to download.
            pages: Number of pages to fetch when the page shows pagination.
            username: Forum account name, or None for anonymous access.
            password: Forum account password, or None for anonymous access.
        """
        self.html = ''
        self.url = url
        first_page = get_html(url=self.url, password=password, username=username)
        parser = BeautifulSoup(first_page, 'html.parser')
        if parser.find('div', {'class': 'pagination'}) is None:
            # Not paginated: the page just fetched is the whole result.
            self.text = first_page
            return

        # The highest page number is deliberately not scanned for: it was
        # never used, and converting link text to int could raise on
        # unexpected markup.

        # The ``start`` offset advances by 10 per page for topic views
        # (``viewtopic`` URLs) and by 20 per page for everything else.
        step = 10 if 'viewtopic' in url else 20
        htmls = []
        start = 0
        while len(htmls) < pages:
            page_url = set_url_arg(self.url, 'start', start)
            htmls.append(get_html(url=page_url, password=password, username=username))
            start += step
        self.text = Merge(htmls=htmls).merged

def get_all_forumbg(tag):
    """Return True when ``tag`` has ``forumbg`` or ``forabg`` among its classes.

    Used as a filter callable for ``find_all``. Always returns a real bool;
    a tag with other classes yields False rather than an implicit None.
    """
    if not tag.has_attr('class'):
        return False
    classes = tag['class']
    return 'forumbg' in classes or 'forabg' in classes

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('url', help='Url for forum page')
parser.add_argument('--password', help='Password for your forum account')
parser.add_argument('--username', help='Username for your forum account')
parser.add_argument('--section', help='Name of section')
# type=int makes argparse reject a non-numeric amount with a usage message
# instead of a ValueError traceback later on.
parser.add_argument('--amount', type=int, default=20,
                    help='Amount of topics your want, default is 20')
args = parser.parse_args()

# The page count assumes 20 topics per listing page. Round up (ceiling
# division) so that --amount 30 fetches two pages rather than one, and
# never go below one page.
pages = max(1, -(-args.amount // 20))

request = Request(url=args.url, pages=pages, username=args.username, password=args.password)
soup = BeautifulSoup(request.text, 'html.parser')

# Maps a section heading to the list of titles found under it.
section_map = {}

current_section = ''

for forumbg in soup.find_all(get_all_forumbg):
    # Search inside this block only. Scanning the whole document here would
    # collect every title once per forumbg/forabg block on the page.
    for topiclist in forumbg.find_all('ul', {'class' : 'topiclist'}):
        if 'forums' in topiclist['class']:
            for topictitle in topiclist.find_all('a', {'class' : 'topictitle'}):
                # setdefault: titles may appear before any usable header.
                section_map.setdefault(current_section, []).append(topictitle.contents[2].strip())
        elif 'topics' in topiclist['class']:
            for topictitle in topiclist.find_all('a', {'class' : 'topictitle'}):
                section_map.setdefault(current_section, []).append(topictitle.string.strip())
        else:
            # Header list: its first list-inner cell names the section.
            header_cells = topiclist.find_all('div', {'class':'list-inner'})
            section_name = header_cells[0].string if header_cells else None
            if section_name is None:
                # No usable heading: keep filing titles under the previous
                # section instead of switching to a None key.
                continue
            current_section = section_name
            section_map.setdefault(current_section, [])

count = 0

def f7(seq):
    """Return the items of ``seq`` as a list without repeats.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable.
    """
    already_seen = set()
    unique_items = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

# Drop repeated titles within each section, keeping first-seen order.
section_map = {name: f7(titles) for name, titles in section_map.items()}

if args.section is None:
    # No section requested: walk every section in order.
    selected = [title for titles in section_map.values() for title in titles]
elif args.section in section_map:
    selected = section_map[args.section]
else:
    selected = None
    print('Invalid section')

if selected is not None and args.amount is not None:
    # Print at most ``--amount`` titles; a negative amount prints nothing.
    limit = max(int(args.amount), 0)
    for count, item in enumerate(selected[:limit], start=1):
        print(item)

#6 Post by **asqwerth** » Thu Aug 08, 2019 11:45 am

I'm always so impressed by people who can fiddle around and come up with things like this just for fun.

#7 Post by **Nite Coder** » Thu Aug 08, 2019 11:46 am

asqwerth wrote: ↑Thu Aug 08, 2019 11:45 am I'm always so impressed by people who can fiddle around and come up with things like this just for fun.

Thank you!

#8 Post by **richb** » Thu Aug 08, 2019 12:03 pm

An interesting option for those who like using the CLI. I would never have thought of that approach.

#9 Post by **asqwerth** » Thu Aug 08, 2019 12:06 pm

I'm sure one could use the script in a conky....

But I haven't even tested the script, so I don't know how it looks. I'll check it out over the weekend.

#10 Post by **richb** » Thu Aug 08, 2019 12:12 pm

One could also set up a desktop or panel launcher

MX Linux Forum

The real way to check the forum

The real way to check the forum

Re: The real way to check the forum

Re: The real way to check the forum

Re: The real way to check the forum

Re: The real way to check the forum

Re: The real way to check the forum

Re: The real way to check the forum

Re: The real way to check the forum

Re: The real way to check the forum

Re: The real way to check the forum