Question
Write a Python assignment to create an n-gram language model.
Solution:
import argparse
import math
import random
from typing import Generator, List, Tuple

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')
# Generator for all n-grams in text
# n is a (non-negative) int
# text is a list of strings
# Yields n-gram tuples of the form (word, context), where context is a tuple of strings
def get_ngrams(n: int, text: List[str]) -> Generator[Tuple[str, Tuple[str, ...]], None, None]:
    token_sep = ''  # the empty string serves as the sentence-boundary token
    text_edited = [token_sep] * (n - 1)  # pad the start so the first word has a full context
    text_edited.extend(text)
    text_edited.append(token_sep)  # mark the end of the sentence
    # Slide a window of width n across the padded token list
    list_ngrams = list(zip(*[text_edited[i:] for i in range(n)]))
    for ngram in list_ngrams:
        yield (ngram[n - 1], tuple(ngram[:n - 1]))
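# Quick sanity check (hypothetical input, not part of the assignment):
# list(get_ngrams(2, ['the', 'cat']))
# -> [('the', ('',)), ('cat', ('the',)), ('', ('cat',))]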
# Loads and tokenizes a corpus
# corpus_path is a string
# Returns a list of sentences, where each sentence is a list of strings
def load_corpus(corpus_path: str) -> List[List[str]]:
    with open(corpus_path) as f:
        corpus = f.read()
    paragraphs = corpus.split('\n\n')  # paragraphs are separated by blank lines
    sentences = []
    words = []
    for par in paragraphs:
        sentences.extend(sent_tokenize(par))
    for sent in sentences:
        words.append(word_tokenize(sent))
    return words
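# For example (hypothetical file contents), a file containing "The cat sat. It purred."
# would come back as [['The', 'cat', 'sat', '.'], ['It', 'purred', '.']]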
# Builds an n-gram model from a corpus
# n is a (non-negative) int
# corpus_path is a string
# Returns an NGramLM
def create_ngram_lm(n: int, corpus_path: str) -> 'NGramLM':
    corpus_data = load_corpus(corpus_path)
    ngram_lm = NGramLM(n)
    for sentence in corpus_data:  # update counts one tokenized sentence at a time
        ngram_lm.update(sentence)
    return ngram_lm
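# Typical usage:
# lm = create_ngram_lm(3, 'sentence.txt')  # 'sentence.txt' is the default corpus path below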
# An n-gram language model
class NGramLM:
    def __init__(self, n: int):
        self.n = n
        self.ngram_counts = {}    # (word, context) -> count
        self.context_counts = {}  # context -> count
        self.vocabulary = set()

    # Updates internal counts based on the n-grams in text
    # text is a list of strings
    # No return value
    def update(self, text: List[str]) -> None:
        self.vocabulary |= set(text)
        self.vocabulary |= {''}  # the boundary token is part of the vocabulary
        for ngram in get_ngrams(self.n, text):
            self.ngram_counts[ngram] = 1 + self.ngram_counts.get(ngram, 0)
            self.context_counts[ngram[1]] = 1 + self.context_counts.get(ngram[1], 0)
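    # For instance, after update(['the', 'cat']) with n = 2 (a hypothetical call),
    # ngram_counts holds {('the', ('',)): 1, ('cat', ('the',)): 1, ('', ('cat',)): 1}
    # and context_counts holds {('',): 1, ('the',): 1, ('cat',): 1}.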
    # Calculates the MLE probability of an n-gram, with optional Laplace smoothing
    # word is a string
    # context is a tuple of strings
    # delta is a float
    # Returns a float
    def get_ngram_prob(self, word: str, context: Tuple[str, ...], delta=.0) -> float:
        context_count = self.context_counts.get(context, 0)
        ngram_count = self.ngram_counts.get((word, context), 0)
        if delta == 0:
            if context_count == 0:
                # Unseen context: fall back to a uniform distribution over the vocabulary
                return 1 / len(self.vocabulary)
            else:
                return ngram_count / context_count
        else:
            # Laplace (add-delta) smoothing
            return (delta + ngram_count) / (context_count + delta * len(self.vocabulary))
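    # Worked example (hypothetical numbers): if ('sat', ('cat',)) was seen 2 times,
    # the context ('cat',) was seen 4 times, |V| = 10, and delta = 0.5, then
    # P(sat | cat) = (0.5 + 2) / (4 + 0.5 * 10) = 2.5 / 9 ≈ 0.278.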
    # Calculates the log probability of a sentence
    # sent is a list of strings
    # delta is a float
    # Returns a float
    def get_sent_log_prob(self, sent: List[str], delta=.0) -> float:
        probability = 0
        for ngram in get_ngrams(self.n, sent):
            ngram_prob = self.get_ngram_prob(ngram[0], ngram[1], delta)
            try:
                probability += math.log2(ngram_prob)
            except ValueError:
                # log2(0) is undefined; treat a zero-probability n-gram as -infinity
                probability += float('-inf')
        return probability
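    # By the chain rule, log2 P(sent) = sum over its n-grams of log2 P(word | context);
    # summing logs avoids the numeric underflow that multiplying raw probabilities would cause.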
    # Calculates the perplexity of a language model on a test corpus
    # corpus is a list of lists of strings
    # Returns a float
    def get_perplexity(self, corpus: List[List[str]]) -> float:
        size = 0
        corpus_sent_probability = 0
        for sent in corpus:
            size += len(sent)
            sent_prob = self.get_sent_log_prob(sent)
            corpus_sent_probability += sent_prob
        avg_log_prob = (-1) * (corpus_sent_probability / size)
        return math.pow(2, avg_log_prob)
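    # This implements perplexity = 2 ** (-(1/N) * log2 P(corpus)), where N is the
    # total token count of the test corpus; lower perplexity means a better fit.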
    # Samples a word from the probability distribution for a given context
    # context is a tuple of strings
    # delta is a float
    # Returns a string
    def generate_random_word(self, context: Tuple[str, ...], delta=.0) -> str:
        prob_before = 0
        prob_current = 0
        sorted_vocabulary = sorted(self.vocabulary)
        r = random.random()
        for word in sorted_vocabulary:
            ngram_probability = self.get_ngram_prob(word, context, delta)
            prob_current += ngram_probability
            if prob_before < r <= prob_current:
                return word
            prob_before += ngram_probability
        # Fallback in case floating-point rounding leaves r above the final cumulative total
        return sorted_vocabulary[-1]
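    # This is inverse-CDF (roulette-wheel) sampling: the vocabulary is laid out on
    # [0, 1] in sorted order, each word owning a slice as wide as its probability,
    # and the word whose slice contains the random draw r is returned.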
    # Generates a random sentence
    # max_length is an int
    # delta is a float
    # Returns a string
    def generate_random_text(self, max_length: int, delta=.0) -> str:
        end_sentence = False
        len_current = 0
        text = [''] * (self.n - 1)  # start from the sentence-boundary context
        if max_length == 0:
            return ''
        first_word = self.generate_random_word(tuple(text), delta)
        len_current += 1
        text.append(first_word)
        while len_current < max_length and not end_sentence:
            # Condition on the most recent n-1 tokens (the empty context for a unigram model)
            context = tuple(text[-(self.n - 1):]) if self.n > 1 else ()
            word = self.generate_random_word(context, delta)
            text.append(word)
            len_current += 1
            if word == '':
                # Drawing the boundary token ends the sentence early
                end_sentence = True
        return ' '.join(text[self.n - 1:])
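    # Example call (output depends on the corpus and random seed):
    # lm.generate_random_text(10, delta=0.5) returns up to ten space-joined tokens,
    # stopping early if the sentence-boundary token is drawn.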
def main(corpus_path: str, delta: float, seed: int):
    trigram_lm = create_ngram_lm(3, corpus_path)
    s1 = 'God has given it to me, let him who touches it beware!'
    s2 = 'Where is the prince, my Dauphin?'
    print(trigram_lm.get_perplexity([word_tokenize(s1)]))
    print(trigram_lm.get_perplexity([word_tokenize(s2)]))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="CS6320 HW1")
    parser.add_argument('corpus_path', nargs="?", type=str, default='sentence.txt', help='Path to corpus file')
    parser.add_argument('delta', nargs="?", type=float, default=.5, help='Delta value used for smoothing')
    parser.add_argument('seed', nargs="?", type=int, default=82761904, help='Random seed used for text generation')
    args = parser.parse_args()
    random.seed(args.seed)
    main(args.corpus_path, args.delta, args.seed)
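To run the solution (assuming it is saved as ngram_lm.py; the file name is arbitrary), all three positional arguments are optional because of their nargs="?" defaults:

python ngram_lm.py sentence.txt 0.5 82761904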