import pandas as pd
# Read the whole sonnet corpus into memory as a single string.
# The encoding is passed explicitly: clean() below handles curly quotes, so
# the corpus is expected to contain non-ASCII text, and open() otherwise
# falls back to a platform-dependent default encoding.
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
with open('./poetry/shakespear_sonnets.txt', encoding='utf-8') as f:
    contents = f.read()
    #print(contents)


import re

def clean(text):
    """Normalise raw corpus text before it is tokenised.

    The text is lower-cased, every newline and tab becomes a single space,
    and curly double quotes are replaced by a straight double quote padded
    with a space on each side. All other punctuation is kept on purpose,
    because the generator treats punctuation marks as tokens.

    NOTE: an earlier experiment also stripped Roman numerals (presumably the
    sonnet headings) and collapsed runs of whitespace. It was disabled, so
    neither is done here.

    Args:
        text: the raw text to normalise.

    Returns:
        The normalised text as a new string.
    """
    # One translate() pass is equivalent to the chained replace() calls:
    # none of the replacement strings contains a character that is itself
    # replaced.
    replacements = str.maketrans({
        '\n': ' ',
        '\t': ' ',
        '“': ' " ',
        '”': ' " ',
    })
    return text.lower().translate(replacements)
    

# Normalise the raw corpus once; the bigrams further down are built from
# this cleaned string.
contents_lower = clean(contents)
#print(contents_lower)


# One way to create the dictionary is to use defaultdict().
# The idea is to build a list of all bigrams in the form of tuples and then use defaultdict to turn it into a dictionary.
from collections import defaultdict

def bigram_lst(text):
    """Return every pair of adjacent tokens in *text* as a list of tuples.

    A token is either a run of word characters and apostrophes, or one of
    the punctuation marks ``. , ! ? ;`` on its own. Any other character is
    dropped by the tokeniser.

    Args:
        text: the (already cleaned) text to tokenise.

    Returns:
        A list of ``(current_word, next_word)`` tuples in text order. It is
        empty when the text holds fewer than two tokens.
    """
    tokens = re.findall(r"[\w']+|[.,!?;]", text)
    # zip() stops at the shorter input, so pairing the token list with
    # itself shifted by one yields exactly the adjacent pairs.
    return list(zip(tokens, tokens[1:]))

#print(bigram_lst(contents_lower))

def create_markov_chain(bigrams):
    """Group bigrams into a table of each word's observed successors.

    Args:
        bigrams: an iterable of ``(current_word, next_word)`` pairs.

    Returns:
        A ``defaultdict(list)`` mapping each first word to the list of words
        that followed it, in input order. Repeated pairs are kept, so a
        frequent successor appears several times in its list.
    """
    successors = defaultdict(list)
    for current_word, next_word in bigrams:
        successors[current_word].append(next_word)
    return successors

# Build the word -> list-of-following-words table from the cleaned corpus.
dico = create_markov_chain(bigram_lst(contents_lower))
#print(dico)


# TODO


import random

def generate_sentence(dico, count=30):
    """Generate a random sequence of words by walking the Markov chain.

    A start word is drawn uniformly from the keys of *dico*. Each following
    word is drawn from the successor list of the previous one. The words are
    joined with single spaces and a period is appended. No capitalisation
    or punctuation clean-up is done here.

    Args:
        dico: mapping from a word to the list of words observed after it.
        count: number of words to generate. At least one word is always
            produced, even for ``count <= 1``.

    Returns:
        The generated words joined by spaces, followed by ``'.'``. The
        sentence is shorter than *count* words if the walk reaches a word
        that has no recorded successor.

    Raises:
        IndexError: if *dico* is empty.
    """
    starts = list(dico)
    if not starts:
        raise IndexError("cannot generate a sentence from an empty Markov chain")

    word = random.choice(starts)
    words = [word]
    for _ in range(count - 1):
        # .get() is used instead of indexing so a dead-end word neither
        # raises nor inserts an empty entry into a defaultdict.
        successors = dico.get(word)
        if not successors:
            break
        word = random.choice(successors)
        words.append(word)

    # End it with a period.
    return ' '.join(words) + '.'


# Bare call: the returned sentence is only echoed when this runs
# interactively (e.g. a notebook cell or REPL); in a script it is discarded.
generate_sentence(dico)

# Looks like sample output pasted from a previous run; as a bare string
# literal it has no effect when executed.
"i derive , or your worth they draw my music , and my sake ; that thou , for thy beauty's summer is by this most worthy of this hell."


# In the following code:
# 1. capitalize the beginning of each sentence
# 2. improve the end of the sentence.


def generate_sentence_improved(dico, count=15, max_extra=100):
    """Generate a capitalised random sentence that ends with a period.

    The walk starts from a random key that is not a punctuation mark. After
    *count* tokens it keeps going until a token ending in ``'.'`` is drawn.
    The first word, and every word that follows ``.``, ``!``, ``?`` or
    ``;``, is capitalised. Finally the space before each punctuation mark
    is removed.

    Args:
        dico: mapping from a word to the list of words observed after it.
        count: number of tokens generated before the function starts
            looking for a closing period.
        max_extra: upper bound on the additional tokens drawn after the
            first *count* while waiting for a period. It guarantees
            termination when no period is reachable in the chain.

    Returns:
        The generated sentence. If the walk hits a word without successors,
        or *max_extra* is exhausted before a period is drawn, the sentence
        is closed with a period anyway.

    Raises:
        IndexError: if *dico* has no key that is not a punctuation mark
            (this includes an empty *dico*).
    """
    capitalise_after = ('.', '!', '?', ';')
    punctuation = {'.', ',', '!', '?', ';'}

    # Filter the candidates once rather than redrawing until a non-
    # punctuation key comes up. The choice is still uniform over the same
    # keys, but it cannot loop forever when every key is punctuation.
    starts = [key for key in dico if key not in punctuation]
    if not starts:
        raise IndexError("Markov chain has no non-punctuation word to start from")

    word = random.choice(starts)
    tokens = [word.capitalize()]

    body_len = max(count - 1, 0)
    step_limit = body_len + max(max_extra, 0)
    steps = 0
    while steps < step_limit:
        # Once the requested length is reached, stop at the first period.
        if steps >= body_len and tokens[-1].endswith('.'):
            break
        # .get() avoids both a crash on a dead-end word and the insertion
        # of empty entries into a defaultdict.
        successors = dico.get(word)
        if not successors:
            break
        word = random.choice(successors)
        # The same capitalisation rule applies to the whole sentence,
        # including the part generated while waiting for the period.
        if tokens[-1].endswith(capitalise_after):
            tokens.append(word.capitalize())
        else:
            tokens.append(word)
        steps += 1

    if not tokens[-1].endswith('.'):
        # The walk stopped early: close the sentence, replacing a dangling
        # punctuation mark instead of producing something like ",.".
        if tokens[-1] in punctuation:
            tokens[-1] = '.'
        else:
            tokens.append('.')

    sentence = ' '.join(tokens)
    sentence = sentence.replace(' .', '.').replace(' ,', ',').replace(' ?', '?').replace(' !', '!').replace(' ;', ';')
    return sentence


# Bare call: the returned sentence is only echoed when this runs
# interactively (e.g. a notebook cell or REPL); in a script it is discarded.
generate_sentence_improved(dico)

# Looks like sample output pasted from a previous run; as a bare string
# literal it has no effect when executed.
"Bases for love knows; Then no unkind, who art thou reviewest this sorrow, if ten times refigured thee i'll vow debate, it the other strains of thy 'will' will show it."


#TODO

# Text Generation

# Introduction

# Corpus

# Markov Chain

# Questions

# Create a Text Generator

# Compare it with your MM