# Source code for corpkit.dictionaries.wordlists

"""
lists of closed class words
"""

# feel free to correct/add things---this was just a quick grab from the web

def closed_class_wordlists():
    """Build the closed-class (and related) word lists.

    Add words to the literal lists below if need be.

    Returns:
        A ``wordlists`` namedtuple with the fields ``pronouns``,
        ``conjunctions``, ``articles``, ``determiners``, ``prepositions``,
        ``connectors``, ``modals``, ``closedclass``, ``stopwords``,
        ``titles`` and ``whpro`` (in that order). Every field holds the
        corresponding list below wrapped as ``Wordlist(words, single=True)``.

        ``closedclass`` is the sorted, de-duplicated union of the pronoun,
        article, conjunction, determiner, preposition, connector and modal
        lists plus the word "not". The stopword, title and wh-word lists are
        not part of it.
    """
    from collections import namedtuple

    # NOTE(review): "each"/"other" and "one"/"another" appear twice here,
    # possibly "each other" / "one another" split into single words. The
    # duplicates are kept as-is; confirm whether they are intended.
    pronouns = [
        u"all",
        u"another",
        u"any",
        u"anybody",
        u"anyone",
        u"anything",
        u"both",
        u"each",
        u"each",
        u"other",
        u"either",
        u"everybody",
        u"everyone",
        u"everything",
        u"few",
        u"he",
        u"her",
        u"hers",
        u"herself",
        u"him",
        u"himself",
        u"his",
        u"it",
        u"i",
        u"its",
        u"itself",
        u"many",
        u"me",
        u"mine",
        u"more",
        u"most",
        u"much",
        u"myself",
        u"neither",
        u"no",
        u"one",
        u"nobody",
        u"none",
        u"nothing",
        u"one",
        u"another",
        u"other",
        u"others",
        u"ours",
        u"ourselves",
        u"several",
        u"she",
        u"some",
        u"somebody",
        u"someone",
        u"something",
        u"that",
        u"their",
        u"theirs",
        u"them",
        u"there",
        u"themselves",
        u"these",
        u"they",
        u"this",
        u"those",
        u"us",
        u"we",
        u"what",
        u"whatever",
        u"which",
        u"whichever",
        u"who",
        u"whoever",
        u"whom",
        u"whomever",
        u"whose",
        u"you",
        u"your",
        u"yours",
        u"yourself",
        u"yourselves",
    ]

    articles = [
        u"a",
        u"an",
        u"the",
        u"teh",
    ]

    # Commented-out entries are kept from the original list; they are not
    # part of the determiners.
    determiners = [
        u"all",
        u"anotha",
        u"another",
        u"any",
        u"any-and-all",
        u"atta",
        u"both",
        u"certain",
        u"couple",
        u"dat",
        u"dem",
        u"dis",
        u"each",
        u"either",
        u"enough",
        u"enuf",
        u"enuff",
        u"every",
        u"few",
        u"fewer",
        u"fewest",
        u"her",
        u"hes",
        u"his",
        u"its",
        u"last",
        u"least",
        #u"little",
        u"many",
        u"more",
        u"most",
        u"much",
        u"muchee",
        u"my",
        u"neither",
        #u"next",
        u"nil",
        u"no",
        u"none",
        u"other",
        u"our",
        u"overmuch",
        u"owne",
        u"plenty",
        u"quodque",
        #u"said",
        u"several",
        u"some",
        u"such",
        u"sufficient",
        u"that",
        u"their",
        u"them",
        u"these",
        u"they",
        u"thilk",
        u"thine",
        u"this",
        u"those",
        u"thy",
        u"umpteen",
        u"us",
        u"various",
        u"wat",
        u"we",
        u"what",
        u"whatever",
        u"which",
        u"whichever",
        u"yonder",
        u"you",
        u"your",
    ]

    prepositions = [
        u"about",
        u"above",
        u"across",
        u"after",
        u"against",
        u"along",
        u"among",
        u"around",
        u"at",
        u"before",
        u"behind",
        u"below",
        u"beneath",
        u"beside",
        u"between",
        u"by",
        u"down",
        u"during",
        u"except",
        u"for",
        u"from",
        u"front",
        u"in",
        u"inside",
        u"instead",
        u"into",
        u"like",
        u"near",
        u"of",
        u"off",
        u"on",
        u"onto",
        u"out",
        u"outside",
        u"over",
        u"past",
        u"since",
        u"through",
        u"to",
        u"top",
        u"toward",
        u"under",
        u"underneath",
        u"until",
        u"up",
        u"upon",
        u"with",
        u"within",
        u"without",
    ]

    # NOTE(review): the connectors list was an entry-for-entry copy of the
    # prepositions list, so it is derived from it here (as an independent
    # copy). If connectors are meant to differ, replace this with a literal.
    connectors = list(prepositions)

    modals = [
        u"would",
        u"will",
        u"can",
        u"could",
        u"may",
        u"should",
        u"might",
        u"must",
        u"ca",
        u"'ll",
        u"'d",
        u"wo",
        u"ought",
        u"need",
        u"shall",
        u"dare",
        u"shalt",
    ]

    conjunctions = [
        u"though",
        u"although",
        u"even though",
        u"while",
        u"if",
        u"only if",
        u"unless",
        u"until",
        u"provided that",
        u"assuming that",
        u"even if",
        u"in case",
        u"lest",
        u"than",
        u"rather than",
        u"whether",
        u"as much as",
        u"whereas",
        u"after",
        u"as long as",
        u"as soon as",
        u"before",
        u"by the time",
        u"now that",
        u"once",
        u"since",
        u"till",
        u"until",
        u"when",
        u"whenever",
        u"while",
        u"because",
        u"since",
        u"so that",
        u"why",
        u"that",
        u"what",
        u"whatever",
        u"which",
        u"whichever",
        u"who",
        u"whoever",
        u"whom",
        u"whomever",
        u"whose",
        u"how",
        u"as though",
        u"as if",
        u"where",
        u"wherever",
        u"for",
        u"and",
        u"nor",
        u"but",
        u"or",
        u"yet",
        u"so",
        u"however",
    ]

    # "shouldn" and "wouldn" were previously misspelled "shoudn" / "woudn",
    # so they never matched the fragments they sit alongside ("isn",
    # "doesn", "couldn", ...). Other odd spellings are kept unchanged.
    stopwords = [
        "yeah", "monday","tuesday","wednesday","thursday","friday",
        "saturday","sunday","a","able","about","above","abst","accordance",
        "according","accordingly","across","act","actually","added","adj",
        "adopted","affected","affecting","affects","after","afterwards",
        "again","against","ah","all","almost","alone","along","already",
        "also","although","always","am","among","amongst","an","and",
        "announce","another","any","anybody","anyhow","anymore","anyone",
        "anything","anyway","anyways","anywhere","apparently","approximately",
        "are","aren","arent","arise","around","as","aside","ask","asking","at",
        "auth","available","away","awfully","b","back","be","became","because",
        "become","becomes","becoming","been","before","beforehand","begin",
        "beginning","beginnings","begins","behind","being","believe","below",
        "beside","besides","between","beyond","biol","both","brief","briefly",
        "but","by","c","ca","came","can","cannot","cant","cause","causes",
        "certain","certainly","co","com","come","comes","contain","containing",
        "contains","could","couldnt","d","date","did","didnt","different","do",
        "does","doesnt","doing","done","dont","down","downwards","due","during",
        "e","each","ed","edu","effect","eg","eight","eighty","either","else",
        "elsewhere","end","ending","enough","especially","et","et-al","etc","even",
        "ever","every","everybody","everyone","everything","everywhere","ex",
        "except","f","far","few","ff","fifth","first","five","fix","followed",
        "following","follows","for","former","formerly","forth","found","four",
        "from","further","furthermore","going","g","gave","get","gets","getting",
        "give","given","gives","giving","go","goes","gone","got","gotten","h",
        "had","happens","hardly","has","hasnt","have","havent","having","he",
        "hed","hence","her","here","hereafter","hereby","herein","heres",
        "hereupon","hers","herself","hes","hi","hid","him","himself","his",
        "hither","home","how","howbeit","however","hundred","i","id","ie",
        "if","ill","im","immediate","immediately","importance","important",
        "in","inc","indeed","index","information","instead","into","invention",
        "inward","is","isnt","it","itd","itll","its","itself","ive","j","just",
        "k","keep","keeps","kept","keys","kg","km","know","known","knows",
        "l","largely","last","lately","later","latter","latterly","least","less",
        "lest","let","lets","like","liked","likely","line","little","ll","look",
        "looking","looks","ltd","m","made","mainly","make","makes","many","may",
        "maybe","me","mean","means","meantime","meanwhile","merely","mg","might",
        "million","miss","ml","more","moreover","most","mostly","mr","mrs","much",
        "mug","must","my","myself","n","na","name","namely","nay","nd","near",
        "nearly","necessarily","necessary","need","needs","neither","never",
        "nevertheless","new","next","nine","ninety","no","nobody","non","none",
        "nonetheless","noone","nor","normally","nos","not","noted","nothing",
        "now","nowhere","o","obtain","obtained","obviously","of","off","often",
        "oh","ok","okay","old","omitted","on","once","one","ones","only","onto",
        "or","ord","other","others","otherwise","ought","our","ours","ourselves",
        "out","outside","over","overall","owing","own","p","page","pages","part",
        "particular","particularly","past","per","perhaps","placed","please",
        "plus","poorly","possible","possibly","potentially","pp","predominantly",
        "present","previously","primarily","probably","promptly","proud","provides",
        "put","q","que","quickly","quite","qv","r","ran","rather","rd","re",
        "readily","really","recent","recently","ref","refs","regarding","regardless",
        "regards","related","relatively","research","respectively","resulted",
        "resulting","results","right","run","s","said","same","saw","say","saying",
        "says","sec","section","see","seeing","seem","seemed","seeming","seems",
        "seen","self","selves","sent","seven","several","shall","she","shed","shell",
        "shes","should","shouldnt","show","showed","shown","showns","shows",
        "significant","significantly","similar","similarly","since","six",
        "slightly","so","some","somebody","somehow","someone","somethan","something",
        "sometime","sometimes","somewhat","somewhere","soon","sorry","specifically",
        "specified","specify","specifying","state","states","still","stop",
        "strongly","sub","substantially","successfully","such","sufficiently",
        "suggest","sup","sure","t","take","taken","taking","tell","tends","th",
        "than","thank","thanks","thanx","that","thatll","thats","thatve","the",
        "their","theirs","them","themselves","then","thence","there","thereafter",
        "thereby","thered","therefore","therein","therell","thereof","therere",
        "theres","thereto","thereupon","thereve","these","they","theyd","theyll",
        "theyre","theyve","think","this","those","thou","though","thoughh","thousand",
        "throug","through","throughout","thru","thus","til","tip","to","together",
        "too","took","toward","towards","tried","tries","truly","try","trying",
        "ts","twice","two","u","un","under","unfortunately","unless","unlike",
        "unlikely","until","unto","up","upon","ups","us","use","used","useful",
        "usefully","usefulness","uses","using","usually","v","value","various","ve",
        "very","via","viz","vol","vols","vs","w","want","wants","was","wasnt","way",
        "we","wed","welcome","well","went","were","werent","weve","what","whatever",
        "whatll","whats","when","whence","whenever","where","whereafter","whereas",
        "whereby","wherein","wheres","whereupon","wherever","whether","which",
        "while","whim","whither","who","whod","whoever","whole","wholl","whom",
        "whomever","whos","whose","why","widely","willing","wish","with","within",
        "without","wont","words","world","would","wouldnt","www","x","y","yes","yet",
        "you","youd","youll","your","youre","yours","yourself","yourselves","youve",
        "z","zero", "isn", "doesn","didn", "couldn", "mustn","shouldn","wasn","wouldn",
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
        "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her",
        "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs",
        "themselves", "what", "which", "who", "whom", "this", "that", "these", "those",
        "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
        "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for", "with",
        "about", "against", "between", "into", "through", "during", "before", "after",
        "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
        "over", "under", "again", "further", "then", "once", "here", "there", "when",
        "where", "why", "how", "all", "any", "both", "each", "few", "more", "most",
        "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
        "than", "too", "very", "s", "t", "can", "will", "just", "don", "should",
        "now", "gonna", "n't", '-lrb-', '-rrb-', "'m", "'ll", "'re", "'s", "'ve", "&",
    ]

    titlewords = [
        u'admiral', u'archbishop', u'alan', u'merrill', u'sarah',
        'queen', u'king', u'sen', u'chancellor', u'prime minister',
        'cardinal', u'bishop', u'father', u'hon', u'rev', u'reverend',
        'pope', u'sir', u'doctor', u'professor', u'president',
        'senator', u'congressman', u'congresswoman', u'mr', u'ms',
        'mrs', u'miss', u'dr', u'bill', u'hillary', u'hillary rodham',
        'saddam', u'osama', u'ayatollah', u'george', u'george w',
        'mitt', u'malcolm', u'barack', u'ronald', u'john', u'john f',
        'william', u'al', u'bob',
    ]

    whpro = [u'who', u'what', u'why', u'where', u'when', u'how']

    # Only contributes to ``closedclass``; it has no field of its own.
    other = ['not']

    from corpkit.dictionaries.process_types import Wordlist

    # Sorted, de-duplicated union of the closed-class categories.
    closedclass = sorted(set(
        pronouns + articles + conjunctions + determiners
        + prepositions + connectors + modals + other
    ))

    # Field names are paired with their lists so the two cannot drift out
    # of step; the order defines the namedtuple's positional layout.
    fields = [
        ('pronouns', pronouns),
        ('conjunctions', conjunctions),
        ('articles', articles),
        ('determiners', determiners),
        ('prepositions', prepositions),
        ('connectors', connectors),
        ('modals', modals),
        ('closedclass', closedclass),
        ('stopwords', stopwords),
        ('titles', titlewords),
        ('whpro', whpro),
    ]
    outputnames = namedtuple('wordlists', [name for name, _ in fields])
    return outputnames(*[Wordlist(words, single=True) for _, words in fields])

# Build the word lists once, at import time, and expose the resulting
# namedtuple as a module-level name.
wordlists = closed_class_wordlists()