-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
192 lines (159 loc) · 8.14 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import cohere, json
import dotenv
import os
dotenv.load_dotenv()
import logging
import requests
import time
import streamlit as st
st.set_page_config(page_title="Cohere AI Deal Analysis", page_icon='🧠')
st.title("Cohere AI Deal Analysis")
st.write("### Create Structured Data Using Cohere AI.")
st.write('''Enter a URL below of a deal memo, funding announcement, or press release about a recent venture capital investment. The app will use [Cohere](https://cohere.com/) to analyze the text and return structured data in JSON format.
''')
with st.sidebar:
st.write('''## About this App
This app uses the Cohere AI API to analyze a adeal announcement from a URL. It will return structured data in JSON format.''')
st.write('''
## Demonstrating Structured Data Extraction
In this example, the model will take in unstructured data from the web, and return a structured JSON object with the data. Specifically, it reads a press release about a recent venture capital investment and and extracts structured data from it, such as the name of the company, the funding round, the investment amount, cofounders, and valuation. This is useful for tracking investments in the startup ecosystem, and for understanding the trends in venture capital funding.
## Why is this interesting?
Data comes from across the web in unstructured format. Organizing the data is the first step in developing better business intelligence.
## A Cohere AI Example
This example uses the [Cohere AI API](https://docs.cohere.com/) backend to perform an analysis on the memo and return a structured JSON response.
## See the Code
You can see the code for this app on [GitHub](https://github.com/johnisanerd/cohere_analyze_press_release)
''')
# Example Deal: https://www.reuters.com/technology/ai-startup-cohere-seeks-5-bln-valuation-latest-fundraising-source-says-2024-03-21/
# Example Deal: "https://techcrunch.com/2023/06/08/ai-startup-cohere-now-valued-at-over-2-1b-raises-270m/"
url_to_scrape = "https://techcrunch.com/2023/06/08/ai-startup-cohere-now-valued-at-over-2-1b-raises-270m/"
# Setup ratelimiting for Jinja AI API
# https://requests-ratelimiter.readthedocs.io
from requests_ratelimiter import LimiterSession
requests_session = LimiterSession(per_minute=20) # Create a rate limiter that allows 20 calls per minute
# This is an example deal we will ask the LLM to analyze the deal memo txt with.
schema_structure = { "type": "json_object",
"schema": {
"type": "object",
"required": ["company_name", "company_description", "funding_type", "startup_vertical"],
"properties": {
'company_name': { "type": "string" },
'company_url': { "type": "string" },
'startup_vertical': { "type": "string" },
'location_country': { "type": "string" },
'location_state': { "type": "string" },
'location_city': { "type": "string" },
'company_description': { "type": "string" },
'founding_year': { "type": "string" },
'funding_type': { "type": "string" },
'funding_amount_millions': { "type": "integer" },
'valuation_millions': { "type": "integer" },
'coinvestors': { "type": "string" },
'premoney_valualtion_millions': { "type": "integer" },
'revenue': { "type": "boolean" },
'prominent_investors_mentioned': { "type": "string" },
'founders': { "type": "string" }
}
}
}
# Load the API key from the .env file
COHERE_API_KEY = os.environ["COHERE_API_KEY"]
co = cohere.Client(api_key=COHERE_API_KEY)
def scrape_jina_ai(url: str) -> str:
'''
Scrape the page using Jinja AI. Return the text.
If it fails, it will return a 404, a 422.
If it successeds, response will be a 200.
Get the status code with response.status_code.
Get the resolved url with response.url.
Get the text response with response.text.
'''
logging.info(f"Scraping url with Jina AI: {url} ")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Connection": "keep-alive",
"Referer": "https://www.google.com/",
"Upgrade-Insecure-Requests": "1"
}
# Get the redirect first.
try:
cleaned_response = requests.get(url, allow_redirects=True, headers=headers)
url = cleaned_response.url
except:
pass
requests_session.headers.update(headers)
try:
response = requests_session.get("https://r.jina.ai/" + url, allow_redirects=True, headers=headers, timeout=60)
except:
return False
if response.status_code == 429: # Check if the rate limit has been reached
seconds_to_wait = int(response.headers["Retry-After"]) # Get the number of seconds to wait from the Retry-After header
logging.error(f"Rate limit exceeded for {url}. Waiting {seconds_to_wait} seconds before retrying...") # Print a message
time.sleep(seconds_to_wait) # Wait for the specified time
return scrape_jina_ai(url) # Recursively call the function again until the rate limit is lifted
return response
def analyze_deal(url: str) -> str:
# Get the deal text from the URL
try:
deal = scrape_jina_ai(url_to_scrape)
if deal:
deal_text = deal.text
# st.write("### Deal Text")
# st.write("The following text was scraped from the URL provided: ")
# st.write(deal_text[:1000])
else:
logging.error("Failed to scrape the deal text from the URL.")
st.write("An error occurred while trying to scrape the deal text from the URL.")
return False
except:
logging.error("Failed to scrape the deal text from the URL.")
st.write("An error occurred while trying to scrape the deal text from the URL.")
return False
deal = co.chat(
# Documentation at: https://docs.cohere.com/reference/chat
model="command-r-plus",
preamble="Analyze the following deal text from a press release.",
message=f'''
This is the deal in markdown format: {deal_text}
Return formatted JSON.
''',
response_format=schema_structure
)
logging.info(deal.text)
return deal.text
str_request_text = 'Enter the url of the deal to analyze.'
str_question_text_default = url_to_scrape
str_submit_button_text = 'Strap in, let\'s go!'
str_submit_button_help_text = 'Click to submit for analysis.'
# https://docs.streamlit.io/library/api-reference/widgets/st.text_area
request = st.text_input(str_request_text,
value=str_question_text_default,
max_chars=1000,
key='request-text-area', # unique key for text_area widget help=None,
on_change=None,
args=None,
kwargs=None,
placeholder=None,
disabled=False,
label_visibility="visible"
)
btn = st.button(str_submit_button_text,
key='submit_button',
help=str_submit_button_help_text,
kwargs=None,
type="primary",
disabled=False,
use_container_width=False)
if btn:
st.divider()
logging.info(f"Analyzing the deal from the URL: {request}")
with st.spinner('Processing results . . . '):
analyzed_deal_string = analyze_deal(request)
st.write(f"Finished analyzing the deal!")
st.divider()
analyzed_deal_json = json.loads(analyzed_deal_string) # analyzed_deal_string is a string in JSON format. Prettify it.
# Display the JSON in a pretty format.
st.write("### Structured Data")
st.write(analyzed_deal_json)