Analyse 2: Keyword in Context (KWIC)
0. Importe und Daten-Upload
import pandas as pd
import re
from pathlib import Path
## for interactivity in jupyter books
from bokeh.io import output_notebook, show
from bokeh.layouts import column
from bokeh.models import CustomJS, TextInput, Div
# Ensure Bokeh output is displayed in the notebook
output_notebook()

# Load every annotated text (one CSV file per document) into a dict that maps
# the file name to its DataFrame.
conllfiles = Path(r"../data/csv")
corpus_annotations = {}
# iterdir() yields entries in arbitrary order; sorting keeps the row order of
# the corpus (and therefore of every search result) reproducible across runs
# and machines.
for file in sorted(conllfiles.iterdir()):
    if file.suffix == '.csv':
        data = pd.read_csv(file)
        corpus_annotations[file.name] = data

# Corpus-level metadata table (semicolon-separated), indexed by document
# identifier so that single documents can be looked up with .loc / `in`.
corpus_metadata = pd.read_csv(Path('../data/metadata/QUADRIGA_FS-Text-01_Data01_Corpus-Table.csv'), sep=';')
corpus_metadata = corpus_metadata.set_index('DC.identifier')
1. KWIC-Suche
Show code cell content
class ContextViewer:
    """Keyword-in-context (KWIC) search over an annotated corpus.

    All annotated texts are concatenated into one token-level DataFrame
    (``self.full_df``), which is then searched by lemma. The annotation
    frames must provide the columns ``Token`` and ``Lemma``; the
    sentence-level search additionally needs ``Sentence_idx``.
    """

    def __init__(self, corpus_annotated, corpus_metadata):
        """Build the search index.

        Args:
            corpus_annotated: dict mapping a file name (e.g. ``"doc.csv"``)
                to the token-level DataFrame of that document.
            corpus_metadata: DataFrame indexed by document identifier (the
                file name without ``.csv``) with a ``DC.date`` column.
        """
        self.prepare_index_dataframe_for_search(corpus_annotated, corpus_metadata)

    def prepare_index_dataframe_for_search(self, corpus_annotated, corpus_metadata):
        """Tag every document with month and file name and concatenate them.

        The frames in ``corpus_annotated`` are modified in place: each one
        receives a ``month`` and a ``filename`` column. Documents without a
        metadata entry get ``month = None`` so that the ``month`` column
        always exists in ``self.full_df``.
        """
        for filename, annotated_text in corpus_annotated.items():
            txtname = filename.replace('.csv', '')
            month = None
            if txtname in corpus_metadata.index:
                _, month, _ = self.get_date_fname(txtname, corpus_metadata)
            annotated_text['month'] = month
            annotated_text['filename'] = filename
        self.full_df = pd.concat(corpus_annotated.values())
        # Replace the per-document indices by one running index, so that
        # positions in full_df are unique across the whole corpus.
        self.full_df = self.full_df.reset_index()
        print(f'Searching in a corpus of {self.full_df.shape[0]} word occurences')

    def get_date_fname(self, txtname, corpus_metadata):
        """Return ``(year, month, day)`` strings for the document ``txtname``.

        The values are prefixes of the ``DC.date`` metadata entry: the first
        4 characters, the first 7 characters and the full string
        (presumably an ISO ``YYYY-MM-DD`` date -- confirm against the
        metadata table).
        """
        date = str(corpus_metadata.loc[txtname, 'DC.date'])
        return date[:4], date[:7], date

    @staticmethod
    def _join_context_tokens(tokens):
        """Join a token window into one string, skipping unusable tokens.

        Missing values and tokens containing a line break are dropped.
        """
        tokens = tokens.dropna().astype(str)
        return ' '.join(tokens[~tokens.str.contains('\n', regex=False)])

    def get_context_words(self, search_terms, n_words):
        """Return the KWIC table for one or more lemmas.

        Args:
            search_terms: lemmas to search for, separated by commas. An
                empty string falls back to ``'Grippe, Krankheit'``.
            n_words: number of tokens taken on each side of a hit. Tokens
                containing a line break are removed from that window, so a
                context can be shorter than ``n_words``.

        Returns:
            DataFrame with the columns ``left_context``, ``word``,
            ``right_context`` and ``month``, one row per hit.
        """
        if len(search_terms) == 0:
            search_terms = 'Grippe, Krankheit'
        terms = [term.strip() for term in search_terms.split(',')]
        terms = [term for term in terms if term]

        # Boolean mask instead of a string-built query(): search terms that
        # contain quotes or brackets cannot break or alter the expression.
        is_hit = self.full_df['Lemma'].isin(terms)
        positions = is_hit.to_numpy().nonzero()[0]

        tokens = self.full_df['Token']
        month_column = self.full_df['month']

        left_contexts = []
        this_words = []
        right_contexts = []
        months = []
        for position in positions:
            # Clamp at 0: a negative slice start would count from the end of
            # the corpus and yield an empty (or wrong) left context.
            start = max(position - n_words, 0)
            left_contexts.append(self._join_context_tokens(tokens.iloc[start:position]))
            right_contexts.append(
                self._join_context_tokens(tokens.iloc[position + 1:position + 1 + n_words])
            )
            this_words.append(tokens.iloc[position])
            months.append(month_column.iloc[position])

        return pd.DataFrame({
            'left_context': left_contexts,
            'word': this_words,
            'right_context': right_contexts,
            'month': months,
        })

    ## currently unused functionality:
    def get_context_sents(self, n_sentences, search_lemma=None):
        """Return sentence-level context for every occurrence of a lemma.

        Args:
            n_sentences: number of sentences shown before and after the
                sentence containing the hit.
            search_lemma: lemma to search for. When ``None`` the user is
                prompted via ``input()``; an empty value falls back to
                ``'Grippe'``.

        Returns:
            DataFrame with the columns ``left_sentences``,
            ``this_sentence``, ``right_sentences`` and ``month``, one row
            per hit.
        """
        if search_lemma is None:
            search_lemma = input('Insert a word to search: ')
        search_lemma = search_lemma.strip()
        if len(search_lemma) == 0:
            search_lemma = 'Grippe'

        hits = self.full_df.loc[
            self.full_df['Lemma'] == search_lemma,
            ['filename', 'Sentence_idx', 'month'],
        ]

        left_contexts = []
        this_sentences = []
        right_contexts = []
        months = []
        for current_filename, current_sentence_id, month in hits.itertuples(index=False, name=None):
            left_contexts.append(self.get_sents(direction=-1,
                                                current_filename=current_filename,
                                                current_sentence_id=current_sentence_id,
                                                n_sentences=n_sentences))
            right_contexts.append(self.get_sents(direction=1,
                                                 current_filename=current_filename,
                                                 current_sentence_id=current_sentence_id,
                                                 n_sentences=n_sentences))
            this_sentences.append(self.get_sents(direction=0,
                                                 current_filename=current_filename,
                                                 current_sentence_id=current_sentence_id,
                                                 n_sentences=1))
            months.append(month)

        return pd.DataFrame({
            'left_sentences': left_contexts,
            'this_sentence': this_sentences,
            'right_sentences': right_contexts,
            'month': months,
        })

    def get_sents(self, direction, current_filename, current_sentence_id, n_sentences):
        """Return up to ``n_sentences`` neighbouring sentences as one string.

        Args:
            direction: ``-1`` for preceding sentences, ``1`` for following
                sentences, ``0`` for the sentence itself.
            current_filename: value of the ``filename`` column of the hit.
            current_sentence_id: ``Sentence_idx`` of the hit.
            n_sentences: how many sentences to collect.

        The sentences are always joined in document order. Sentence ids
        that do not exist in the document contribute nothing.
        """
        # A set + sort gives document order for both directions and
        # collapses direction == 0 to the single offset 0.
        offsets = sorted({n * direction for n in range(1, n_sentences + 1)})
        sentences = [
            self.create_sentence(current_filename, current_sentence_id + offset)
            for offset in offsets
        ]
        return ' '.join(sentence for sentence in sentences if sentence)

    def create_sentence(self, current_filename, sentence_id):
        """Return the tokens of one sentence of one document, space-joined.

        Returns an empty string when the document has no such sentence.
        """
        in_sentence = (
            (self.full_df['filename'] == current_filename)
            & (self.full_df['Sentence_idx'] == sentence_id)
        )
        words = self.full_df.loc[in_sentence, 'Token']
        # Missing tokens would make str.join fail, so they are skipped.
        return ' '.join(words.dropna().astype(str))
# Text field in which the reader enters the lemmas to search for.
search_terms = TextInput(
    value='Grippe, Krankheit',
    title="Geben Sie die zu suchenden Wörter ein und trennen Sie sie durch Kommas, wenn es mehrere sind:",
)
# Initial value of the Python-side variable; the callback below overwrites it
# whenever the text field changes.
search_terms_str = search_terms.value.strip()

# JavaScript callback that writes the entered text back into the Python
# variable `search_terms_str` of the running kernel.
rewrite_var_after_input = CustomJS(args=dict(text_input=search_terms), code="""
    var word = text_input.value.trim();
    console.log('Input value:', word);
    // IPython.notebook.kernel is only available when the page is backed by a
    // (classic) notebook front end; skip silently on a static page instead of
    // raising a ReferenceError.
    if (typeof IPython !== 'undefined' && IPython.notebook && IPython.notebook.kernel) {
        // JSON.stringify produces a quoted, fully escaped string literal, so
        // apostrophes or other special characters in the input cannot break
        // out of the assignment and run arbitrary code in the kernel.
        IPython.notebook.kernel.execute("search_terms_str = " + JSON.stringify(word));
    }
""")
search_terms.js_on_change('value', rewrite_var_after_input)

# Layout and display
layout = column(search_terms)
show(layout)
# Build the searchable token index from the loaded annotations and metadata
kwic = ContextViewer(corpus_annotations, corpus_metadata)
Searching in a corpus of 33192061 word occurences
# Run the KWIC search for the terms currently stored in search_terms_str
kwic.get_context_words(search_terms_str, n_words=5)
 | left_context | word | right_context | month |
---|---|---|---|---|
0 | Wirtschaft bedeutet , den recken ohne Ende , eine | Krankheit | , eine Aufein- anderfolge von Fieberschauern .... | 1919-06 |
1 | Heute Nathinittag- entschlief . , Fantt | nach „ | Krankheit | meine liebe Frau ; unjere gute , treue Mutter | 1919-06 |
2 | „ mit seinem Christentum nicht evnst . Meyrh... | Krankheit | verhinderk gewesen Feten , | ri beschlofsen ... | 1919-04 |
3 | wurde uns am Gonntay , nachmittag nac kurzer , | Krankheit | dur den Tod entrissen . In tiefstem - | 1919-12 |
4 | den Technischen Staats- lehranstalten . j ' 2 | Krankheit | des trüheren Kailerpaares . Drähtmelbung 904 w... | 1918-12 |
... | ... | ... | ... | ... |
1257 | ! [ eS Bt eie eabhere Geht u den | Grippe | wegen geschlossenen 309 Groß- foricht nict der... | 1918-10 |
1258 | die erforderlichen Konsequen- | 3je " BoserLis... | Grippe | erkrankt ist ; infolgedessen haben | befriedig... | 1918-10 |
1259 | fein Stimmrecht auszuliben . der dr jen : 1. | Krankheit | , 2 WE shlebfan und wii iO > < | 1918-05 |
1260 | vor der National » alerie , ist gestern nach | Krankheit | in einer Berliner Wohnung gestorven . Prof. Tu... | 1919-02 |
1261 | Biersteuer « Auf Anordnung der Sowset-Regier... | Grippe | erkrankt war und erst seit kur ; Zustimmung hi... | 1918-08 |
1262 rows × 4 columns