Filtering pull requests by non-dev authorship

I’m getting back into reviewing pull requests at the moment, and want to start spending a bit more time on PRs made by new or relatively new contributors. Does anyone know a good way to filter the PR list to exclude anyone in https://github.com/orgs/matplotlib/teams/developers?

I have tried searching GitHub to exclude given individual accounts, but it seems like it’s not possible to chain together searches with OR, AND, etc.


I don’t know if there is a way through the GH UI, but if you pull the issues with the API there is an 'author_association' field that gives you that information.
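
For example, a quick way to peek at that field with plain requests (unauthenticated, so rate-limited; 18536 is just one of the PRs that shows up in the output further down):

import requests

r = requests.get('https://api.github.com/repos/matplotlib/matplotlib/issues/18536')
r.raise_for_status()
print(r.json()['author_association'])  # e.g. FIRST_TIME_CONTRIBUTOR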

I would definitely merge a CLI tool that printed out the list of currently open PRs from first time contributors…


Warning: this is some internal throw-away code that I have been playing with; you get negative warranty with it :wink:

import asyncio
import os

import aiohttp
import gidgethub.aiohttp
import pymongo

# Local MongoDB used to persist the GitHub API response cache between runs.
conn = pymongo.MongoClient()
db = conn.get_database('mpl_info')
col = db.get_collection('log')
m_cache = db.get_collection('cache')
issues = db.get_collection('issues')


def dump_cache(cache, m_cache):
    # Persist the in-memory cache to MongoDB (upsert keyed on 'k').
    for k, v in cache.items():
        m_cache.replace_one({'k': k}, {'k': k, 'v': v}, upsert=True)


def restore_cache(cache, m_cache):
    # Re-populate the in-memory cache from MongoDB.
    for doc in m_cache.find():
        cache[doc['k']] = doc['v']


try:
    with open(os.path.expanduser('~/.ghoauth'), 'r') as f:
        oauth_token = f.read()
except FileNotFoundError:
    oauth_token = None

try:
    cache
except NameError:
    cache = {}
    restore_cache(cache, m_cache)

async def get_issues(org, repo, oauth_token):
    """This gets all issues... ever... be careful."""
    async with aiohttp.ClientSession() as session:
        gh = gidgethub.aiohttp.GitHubAPI(session, 'tacaswell',
                                         oauth_token=oauth_token, cache=cache)
        data = []
        async for d in gh.getiter(f"/repos/{org}/{repo}/issues{{?state}}",
                                  {'state': 'all'}):
            data.append(d)

        dump_cache(cache, m_cache)
        return data

async def get_new_contributor_prs(org, repo, oauth_token):
    """Get the first ~10 open PRs from (first-time) contributors."""
    async with aiohttp.ClientSession() as session:
        gh = gidgethub.aiohttp.GitHubAPI(session, 'tacaswell',
                                         oauth_token=oauth_token, cache=cache)
        data = []
        async for d in gh.getiter(f"/repos/{org}/{repo}/issues{{?state}}",
                                  {'state': 'open'}):
            # The issues endpoint returns both issues and PRs; PRs carry a
            # 'pull_request' key.  Org members/owners are excluded by their
            # author_association.
            if (d['author_association'] in {'CONTRIBUTOR', 'FIRST_TIME_CONTRIBUTOR'} and
                    'pull_request' in d):
                data.append(d)
            if len(data) > 10:
                break

        dump_cache(cache, m_cache)
        return data

# This will run in IPython, which supports top-level await
# (otherwise wrap the coroutine in asyncio.run):
# new_issues = await get_new_contributor_prs('matplotlib', 'matplotlib', oauth_token)
# for d in new_issues:
#     print(f"{d['user']['login']: <20} {d['author_association']: <24} {d['pull_request']['html_url']}")


which prints

emilyfy              FIRST_TIME_CONTRIBUTOR   https://github.com/matplotlib/matplotlib/pull/18536
l-johnston           CONTRIBUTOR              https://github.com/matplotlib/matplotlib/pull/18529
andrzejnovak         CONTRIBUTOR              https://github.com/matplotlib/matplotlib/pull/18511
xordux               FIRST_TIME_CONTRIBUTOR   https://github.com/matplotlib/matplotlib/pull/18493
larsoner             CONTRIBUTOR              https://github.com/matplotlib/matplotlib/pull/18478
greglucas            CONTRIBUTOR              https://github.com/matplotlib/matplotlib/pull/18472
casperdcl            FIRST_TIME_CONTRIBUTOR   https://github.com/matplotlib/matplotlib/pull/18397
ShawnChen1996        FIRST_TIME_CONTRIBUTOR   https://github.com/matplotlib/matplotlib/pull/18340
grrlic               FIRST_TIME_CONTRIBUTOR   https://github.com/matplotlib/matplotlib/pull/18257
Huzaib               FIRST_TIME_CONTRIBUTOR   https://github.com/matplotlib/matplotlib/pull/18198
tomneep              CONTRIBUTOR              https://github.com/matplotlib/matplotlib/pull/18114

We would probably want to use a sync API, parameterize it, etc., but the information to do this is definitely available!
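
For reference, a minimal synchronous sketch of that same loop with plain requests (the function name and token handling here are mine, not an existing tool):

import requests

def get_new_contributor_prs_sync(org, repo, token=None, limit=10):
    """Page through open issues and keep PRs from (first-time) contributors."""
    headers = {'Authorization': f'token {token}'} if token else {}
    out, page = [], 1
    while len(out) < limit:
        r = requests.get(f'https://api.github.com/repos/{org}/{repo}/issues',
                         headers=headers,
                         params={'state': 'open', 'per_page': 100, 'page': page})
        r.raise_for_status()
        batch = r.json()
        if not batch:
            break
        for d in batch:
            # Issues that are PRs carry a 'pull_request' key.
            if ('pull_request' in d and
                    d['author_association'] in {'CONTRIBUTOR',
                                                'FIRST_TIME_CONTRIBUTOR'}):
                out.append(d)
        page += 1
    return out[:limit]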

Yeah, in the same vein as Tom’s reply, here is another way of using the GitHub API:

import requests
import pandas as pd
import dask.bag as db
from dask.diagnostics import ProgressBar

URL = 'https://api.github.com/repos/matplotlib/matplotlib'
# Fill in your own GitHub credentials (username, personal access token).
auth = ('<username>', '<token>')

def paged_requests(page):
    r = requests.get(f'{URL}/pulls', auth=auth,
                     params={'page': page, 'per_page': 100, 'state': 'all'})
    return r.json()

# Assumes fewer than ~10,000 PRs (100 pages of 100).
pull_pages = db.from_sequence(range(1, (10000 // 100)))
pulls = pull_pages.map(paged_requests)
pull_dfs = pulls.map(pd.json_normalize)

with ProgressBar():
    # Single-threaded to avoid tripping GitHub's abuse rate limits;
    # at this point a plain loop would do just as well.
    pull_requests = pull_dfs.compute(scheduler='single-threaded')

pdf = pd.concat(pull_requests, sort=False)

will get you a DataFrame with all the PRs, which you can then filter on author_association.
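
e.g., once you have pdf, something along these lines should pull out just the new-contributor PRs (a sketch; the column names assume the default json_normalize flattening, and you can tweak the set of associations as needed):

new_contrib = pdf[pdf['author_association'].isin(
    ['CONTRIBUTOR', 'FIRST_TIME_CONTRIBUTOR']) & (pdf['state'] == 'open')]
print(new_contrib[['user.login', 'author_association', 'html_url']])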

Fab, thanks a lot for the replies, I will have a play around with the GitHub API and try to get something working on my end!
