# Hugging Face Spaces app — Streamlit demo for tweet sentiment-phrase extraction.
| import numpy as np # linear algebra | |
| import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) | |
| from transformers import AutoTokenizer | |
| from transformers import TFAutoModelForQuestionAnswering | |
| from datasets import Dataset | |
| import streamlit as st | |
# Load the fine-tuned roberta-base checkpoint: the tokenizer turns raw text
# into input IDs the model can make sense of; the model predicts answer spans.
model_checkpoint = "Modfiededition/roberta-fine-tuned-tweet-sentiment-extractor"


def load_tokenizer():
    """Return the tokenizer shipped with the fine-tuned checkpoint."""
    return AutoTokenizer.from_pretrained(model_checkpoint)


def load_model():
    """Return the TF question-answering model from the same checkpoint."""
    return TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)


tokenizer = load_tokenizer()
model = load_model()
# ---------------------------------------------------------------------------
# UI: title, tweet input, sentiment picker, and the trigger button.
# ---------------------------------------------------------------------------
st.title("Tweet Sentiment Extractor...")

# Tweet text (capped at 500 chars) and the sentiment to extract for.
textbox = st.text_area('Write your text in this box:', '', height=100, max_chars=500)
option = st.selectbox(
    'Pick the sentiment',
    ('positive', 'negative', 'neutral'),
)

# Token budget for the (sentiment, text) pair fed to the model.
MAX_LENGTH = 105

# Wrap the single example in a datasets.Dataset so we can reuse .map() below.
python_dict = {"text": [textbox], "sentiment": [option]}
dataset = Dataset.from_dict(python_dict)

button = st.button(
    'Click here to extract the word/phrase from the text with the given sentiment: {0}..'.format(option)
)
# ---------------------------------------------------------------------------
# Inference: runs only once the user clicks the button.
# ---------------------------------------------------------------------------
if button:
    if not textbox:
        st.markdown("#### " + "Please write something in the above textbox..")
    else:
        with st.spinner('In progress.......'):

            def process_data(examples):
                """Tokenize (sentiment, text) pairs QA-style.

                The sentiment acts as the "question" and the tweet as the
                "context". Offset mappings for every token that is NOT part
                of the context (question tokens, special tokens, padding)
                are replaced with None so post-processing can skip them.
                """
                questions = examples["sentiment"]
                context = examples["text"]
                inputs = tokenizer(
                    questions,
                    context,
                    max_length=MAX_LENGTH,
                    # BUGFIX: the textbox allows up to 500 chars, which can
                    # exceed MAX_LENGTH tokens; without truncation the
                    # sequence silently overflows the intended budget.
                    # "only_second" truncates the tweet, never the question.
                    truncation="only_second",
                    padding="max_length",
                    return_offsets_mapping=True,
                )
                # Keep offsets only for context tokens (sequence id == 1).
                for i in range(len(inputs["input_ids"])):
                    offset = inputs["offset_mapping"][i]
                    sequence_ids = inputs.sequence_ids(i)
                    inputs["offset_mapping"][i] = [
                        o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
                    ]
                return inputs

            processed_raw_data = dataset.map(
                process_data,
                batched=True,
            )
            tf_raw_dataset = processed_raw_data.to_tf_dataset(
                columns=["input_ids", "attention_mask"],
                shuffle=False,
                batch_size=1,
            )

            # Raw span scores from the model.
            outputs = model.predict(tf_raw_dataset)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            # Post-processing: turn start/end logits into a text answer.
            n_best = 20  # number of top start/end candidates to consider

            def predict_answers(inputs):
                """Pick the best (start, end) span per example.

                Scans the n_best highest-scoring start and end positions,
                skipping pairs that fall outside the context or have
                end < start, and slices the answer out of the original text.
                """
                predicted_answer = []
                for i in range(len(inputs["offset_mapping"])):
                    start_logit = inputs["start_logits"][i]
                    end_logit = inputs["end_logits"][i]
                    context = inputs["text"][i]
                    offset = inputs["offset_mapping"][i]
                    start_indexes = np.argsort(start_logit)[-1:-n_best - 1:-1].tolist()
                    end_indexes = np.argsort(end_logit)[-1:-n_best - 1:-1].tolist()
                    answer = None
                    for start_index in start_indexes:
                        for end_index in end_indexes:
                            # Skip candidates that are not context tokens.
                            if offset[start_index] is None or offset[end_index] is None:
                                continue
                            # Skip spans with negative length.
                            if end_index < start_index:
                                continue
                            answer = context[offset[start_index][0]:offset[end_index][1]]
                            break
                        if answer is not None:
                            break
                    # BUGFIX: the original appended `answer` here even when no
                    # valid span was found, which raised NameError on the
                    # first example (or reused a stale answer from a previous
                    # one). Fall back to the full text instead.
                    predicted_answer.append(answer if answer is not None else context)
                return {"predicted_answer": predicted_answer}

            # Join logits and original text back onto the tokenized rows.
            processed_raw_data.set_format("pandas")
            processed_raw_df = processed_raw_data[:]
            processed_raw_df["start_logits"] = start_logits.tolist()
            processed_raw_df["end_logits"] = end_logits.tolist()
            processed_raw_df["text"] = python_dict["text"]
            final_data = Dataset.from_pandas(processed_raw_df)
            final_data = final_data.map(predict_answers, batched=True)

            # Show the extracted word/phrase.
            st.markdown("## " + final_data["predicted_answer"][0])