├── .env_example ├── .gitignore ├── LICENSE ├── Model ├── __init__.py ├── invokemodel.py └── provider.py ├── README.md ├── agent ├── __init__.py └── web_agent.py ├── config ├── __init__.py ├── log.py ├── settings.py └── user_settings.py ├── configure ├── __init__.py ├── config_llm.py ├── llama.py └── vision.py ├── extras ├── __init__.py ├── pawelzmarlak-2025-01-22T16_41_02.675Z.png ├── pawelzmarlak-2025-02-13T06_51_12.773Z.png ├── safejsonload.py └── viewimage.py ├── main.py ├── memory ├── __init__.py └── research_mem.py ├── requirements.txt ├── source_reliable ├── __init__.py └── source_reliability_class.py ├── test ├── __init__.py ├── ollama.py ├── selenium.py └── test_model.py └── tools ├── __init__.py ├── capture_ss.py ├── create_vecstore.py ├── extract_urls.py ├── fetch_webpage.py ├── host_tracker.py ├── size_limit.py ├── split_doc.py ├── topic_into_sub.py └── vision_query.py /.env_example: -------------------------------------------------------------------------------- 1 | GROQ_API_KEY = "API_KEY" 2 | BRAVE_API_KEY = "API_KEY" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | agent_memory.json 2 | HOSTS.txt 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # UV 101 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 
104 | #uv.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 119 | .pdm.toml 120 | .pdm-python 121 | .pdm-build/ 122 | 123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 171 | #.idea/ 172 | 173 | # PyPI configuration file 174 | .pypirc 175 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. 
Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 
88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 
146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 
209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. 
This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. 
But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 
387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. 
You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. 
"Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 
564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 
628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | Copyright (C) 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 
675 | 
--------------------------------------------------------------------------------
/Model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Haseebasif7/SurfAgent/e527e1bce6234e3b4456298811550bf3a7552375/Model/__init__.py
--------------------------------------------------------------------------------
/Model/invokemodel.py:
--------------------------------------------------------------------------------
1 | from langchain.schema import HumanMessage, AIMessage
2 | 
3 | def invoke_model(llm, prompt: str) -> AIMessage:
4 |     """Helper to invoke LLM with a single prompt."""
5 |     response = llm([HumanMessage(content=prompt)])
6 |     return response
--------------------------------------------------------------------------------
/Model/provider.py:
--------------------------------------------------------------------------------
1 | class ModelProvider:
2 |     OLLAMA = "ollama"
3 |     GROQ = "groq"
4 | 
5 |     @staticmethod
6 |     def get_provider_choice() -> str:
7 |         return ModelProvider.GROQ
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # 🌐 **SurfAgent: The Web Search and Analysis Agent**
3 | 
4 | SurfAgent is a sophisticated **CLI-based agent** built from scratch. Powered by **Selenium** and **Brave Search**, SurfAgent extracts relevant information, analyzes images related to your queries using state-of-the-art **Llama Vision Models**, and **cites** the sources of that information as links. These models are compatible with **GROQ** or **OLLAMA**, offering seamless integration.
5 | 
6 | ---
7 | 
8 | ## ✨ **Key Features**
9 | 
10 | ### 📁 Contextual Memory
11 | SurfAgent remembers and learns from past interactions, storing details in a dynamic memory system. Memory snapshots are available in JSON format, providing insights like:
12 | ```json
13 | {
14 |     "query_types": "",
15 |     "last_success": "",
16 |     "last_failure": "",
17 |     "total_attempts": "",
18 |     "successful_attempts": "",
19 |     "average_response_time": "",
20 |     "notes": ""
21 | }
22 | ```
23 | 
24 | ### 🚫 Intelligent Host Management
25 | SurfAgent keeps a record of problematic hosts in a `HOSTS.txt` file, ensuring those hosts are avoided in future searches.
26 | 
27 | *(screenshot omitted)*
28 | 
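A minimal sketch of the idea (the actual logic lives in `tools/host_tracker.py` and may differ; the one-host-per-line file format and the helper names here are illustrative only):

```python
from pathlib import Path
from urllib.parse import urlparse

HOSTS_FILE = Path("HOSTS.txt")  # assumed format: one problematic host per line

def load_blocked_hosts() -> set:
    """Read hosts that caused failures in earlier runs."""
    if not HOSTS_FILE.exists():
        return set()
    return {line.strip() for line in HOSTS_FILE.read_text().splitlines() if line.strip()}

def record_bad_host(url: str) -> None:
    """Remember a host so later searches can skip it."""
    host = urlparse(url).netloc
    if host and host not in load_blocked_hosts():
        with HOSTS_FILE.open("a") as f:
            f.write(host + "\n")

def filter_urls(urls: list) -> list:
    """Drop search results whose host has been flagged before."""
    blocked = load_blocked_hosts()
    return [u for u in urls if urlparse(u).netloc not in blocked]
```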
29 | ### 🖼️ Image Analysis
30 | Integrates advanced image processing by fetching relevant images based on user input or context, then passing them to **Llama Vision Models**. The models analyze the images to extract key information, such as text, objects, patterns, or other visual data, and turn it into meaningful insights or responses.
31 | 
32 | *(screenshot omitted)*
33 | 
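As a rough illustration of what such a vision call can look like with the Groq SDK (a hedged sketch, not the repo's actual `tools/vision_query.py`; the helper name and model id are placeholders you would need to adjust):

```python
import base64
from groq import Groq  # assumes the groq Python SDK is installed

client = Groq()  # reads GROQ_API_KEY from the environment

def describe_image(image_path: str, question: str) -> str:
    """Send a local image plus a question to a Llama vision model and return its answer."""
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    response = client.chat.completions.create(
        model="llama-3.2-11b-vision-preview",  # placeholder model id; use any Groq vision model
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
            ],
        }],
    )
    return response.choices[0].message.content
```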
34 | ### 🔗 LangChain Integration
35 | Utilizes **LangChain tools** to enhance automation and analytical capabilities.
36 | 
37 | ---
38 | 
39 | ## 🚀 **How to Run SurfAgent**
40 | 
41 | To get started with SurfAgent, follow these simple steps:
42 | 
43 | ### 🛠️ **Step 1: Clone the Repository**
44 | 
45 | First, clone the SurfAgent repository from GitHub to your local machine. Open your terminal or command prompt and run the following command:
46 | 
47 | ```bash
48 | git clone https://github.com/Haseebasif7/SurfAgent.git
49 | ```
50 | 
51 | ### 🛠️ **Step 2: Install the Required Dependencies**
52 | 
53 | Navigate to the `SurfAgent` directory:
54 | 
55 | ```bash
56 | cd SurfAgent
57 | ```
58 | 
59 | Then, install the required libraries by running:
60 | 
61 | ```bash
62 | pip install -r requirements.txt
63 | ```
64 | 
65 | This installs the necessary packages, including **Selenium**, **LangChain**, and the other libraries SurfAgent depends on.
66 | 
67 | ### 🔑 **Step 3: Set Up Environment Variables**
68 | 
69 | Create a `.env` file in the root directory of the SurfAgent project and add your API keys for **GROQ** and **Brave Search**:
70 | 
71 | ```env
72 | GROQ_API_KEY="your_groq_api_key"
73 | BRAVE_API_KEY="your_brave_api_key"
74 | ```
75 | 
76 | Make sure to replace `"your_groq_api_key"` and `"your_brave_api_key"` with your actual API keys.
77 | 
78 | ### 🖥️ **Step 4: Run SurfAgent**
79 | 
80 | Once the dependencies are installed and environment variables are set, you're ready to run SurfAgent. Execute the following command to start the agent:
81 | 
82 | ```bash
83 | python main.py
84 | ```
85 | 
86 | SurfAgent will now initialize, begin processing web searches, and provide results based on your queries. Enjoy enhanced web search capabilities with intelligent memory, host management, and image analysis!
87 | 
88 | ---
89 | 
90 | ### 🧑‍💻 **Troubleshooting && Contributions**
91 | 
92 | If you encounter any issues during installation or want to add a new feature, feel free to:
93 | 
94 | - Open an Issue or a Pull Request to contribute
95 | 
96 | ---
97 | 
--------------------------------------------------------------------------------
/agent/__init__.py:
--------------------------------------------------------------------------------
1 | ## Made by Haseebasif7
--------------------------------------------------------------------------------
/agent/web_agent.py:
--------------------------------------------------------------------------------
1 | from memory.research_mem import ResearchMemory 2 | from tools.host_tracker import host_tracker 3 | from config.log import logger 4 | from typing import Dict, List 5 | from datetime import datetime 6 | from urllib.parse import urlparse 7 | import requests 8 | import time 9 | import json 10 | import re 11 | from Model.invokemodel import invoke_model 12 | from extras.safejsonload import safe_json_loads 13 | from config.settings import BRAVE_API_KEY 14 | from tools.extract_urls import extract_urls_from_search_results 15 | from tools.fetch_webpage import fetch_webpage_content 16 | 17 | class WebAgent: 18 | def __init__(self, retriever, llm, prompt, brave_search, wikipedia, provider): 19 | self.retriever = retriever 20 | self.llm = llm 21 | self.prompt = prompt 22 | self.brave_search = brave_search 23 | self.wikipedia = wikipedia 24 | self.provider = provider 25 | self.max_retries = 3 26 | self.retry_delay = 2 27 | self.research_memory = {} 28 | self.confidence_threshold = 0.5 29 | self.host_tracker = host_tracker 30 | self.current_topic = None 31 | self.memory = ResearchMemory() 32 | self.current_assessment = None 33 | 34 | def assess_content_relevance(self, content: str, topic: str) -> Dict: 35 | assessment_prompt = f"""You are a content assessment expert. Analyze this content's relevance and completeness for the given topic. 36 | Consider: 37 | 1. How directly it answers the topic/question 38 | 2. The specificity and accuracy of information 39 | 3. Whether it provides context and supporting details 40 | 4. The currentness and reliability of the information 41 | 42 | Topic: {topic} 43 | Content length: {len(content)} characters 44 | First 1000 chars: {content[:1000]} 45 | 46 | You must respond with ONLY a JSON object in this exact format: 47 | {{ 48 | "relevance": <float between 0.0 and 1.0>, 49 | "is_complete": <true or false>, 50 | "found_data": "<brief summary of the relevant data found>", 51 | "needs_verification": <true or false>, 52 | "needs_context": <true or false>, 53 | "confidence": <float between 0.0 and 1.0> 54 | }}""" 55 | 56 | try: 57 | response = invoke_model(self.llm, assessment_prompt) 58 | response_text = response.content.strip() 59 | json_match = re.search(r'\{[\s\S]*?\}', response_text) 60 | if not json_match: 61 | return { 62 | 'relevance': 0.5 if len(content) > 100 else 0.0, 63 | 'is_complete': False, 64 | 'found_data': content[:200] if len(content) > 0 else '', 65 | 'needs_verification': True, 66 | 'needs_context': True, 67 | 'confidence': 0.3 68 | } 69 | json_str = json_match.group(0) 70 | fallback = { 71 | 'relevance': 0.5 if len(content) > 100 else 0.0, 72 | 'is_complete': False, 73 | 'found_data': content[:200] if len(content) > 0 else '', 74 | 'needs_verification': True, 75 | 'needs_context': True, 76 | 'confidence': 0.3 77 | } 78 | result = safe_json_loads(json_str, fallback, content) 79 | return { 80 | 'relevance': float(result.get('relevance', 0)), 81 | 'is_complete': bool(result.get('is_complete', False)), 82 | 'found_data': str(result.get('found_data', '')), 83 | 'needs_verification': bool(result.get('needs_verification', True)), 84 | 'needs_context': bool(result.get('needs_context', True)), 85 | 'confidence': float(result.get('confidence', 0)) 86 | } 87 | except Exception as e: 88 | logger.error(f"Error in content assessment: {str(e)}") 89 | return { 90 | 'relevance': 0.0, 91 | 'is_complete': False, 92 | 'found_data': '', 93 | 'needs_verification': True, 94 | 'needs_context': True, 95 | 'confidence': 0.0 96 | } 97 | 98 | def extract_key_information(self, content: str, topic: str) -> Dict: 99 | extraction_prompt = f"""You are a precise information extractor. Extract key information from the content that is relevant to the topic. 
100 | You must respond in valid JSON format with exactly these fields: 101 | {{ 102 | "main_facts": [list of key facts as strings], 103 | "confidence": number between 0.0-1.0, 104 | "timestamp": string or null, 105 | "source_quality": number between 0.0-1.0 106 | }} 107 | 108 | Topic: {topic} 109 | Content: {content} 110 | 111 | Respond ONLY with the JSON object, no other text:""" 112 | 113 | try: 114 | response = invoke_model(self.llm, extraction_prompt) 115 | response_text = response.content.strip() 116 | json_match = re.search(r'\{[\s\S]*?\}', response_text) 117 | if not json_match: 118 | return { 119 | "main_facts": ["Unable to extract structured information from source"], 120 | "confidence": 0.0, 121 | "timestamp": None, 122 | "source_quality": 0.0 123 | } 124 | json_str = json_match.group(0) 125 | fallback = { 126 | "main_facts": ["Unable to extract structured information from source"], 127 | "confidence": 0.0, 128 | "timestamp": None, 129 | "source_quality": 0.0 130 | } 131 | info = safe_json_loads(json_str, fallback, content) 132 | 133 | if not isinstance(info.get('main_facts', []), list): 134 | info['main_facts'] = [str(info.get('main_facts', ''))] 135 | 136 | return { 137 | 'main_facts': info.get('main_facts', []), 138 | 'confidence': min(max(info.get('confidence', 0.0), 0.0), 1.0), 139 | 'timestamp': info.get('timestamp'), 140 | 'source_quality': min(max(info.get('source_quality', 0.0), 0.0), 1.0) 141 | } 142 | except Exception as e: 143 | logger.error(f"Error extracting information: {str(e)}") 144 | return { 145 | "main_facts": ["Unable to extract structured information from source"], 146 | "confidence": 0.0, 147 | "timestamp": None, 148 | "source_quality": 0.0 149 | } 150 | 151 | def assess_question_complexity(self, topic: str) -> float: 152 | complexity_prompt = f""" 153 | Analyze the complexity of this research topic/question. 
154 | Rate from 0.0 to 1.0, where: 155 | - 0.0: Very simple 156 | - 0.3: Basic fact-finding 157 | - 0.6: Moderate complexity 158 | - 1.0: Complex analysis 159 | 160 | Topic: {topic} 161 | 162 | Respond with only a number between 0.0 and 1.0:""" 163 | 164 | try: 165 | response = invoke_model(self.llm, complexity_prompt) 166 | matches = re.findall(r"0?\.[0-9]+", response.content) 167 | if matches: 168 | rating = float(matches[0]) 169 | else: 170 | rating = 0.5 171 | return min(max(rating, 0.0), 1.0) 172 | except Exception as e: 173 | logger.error(f"Error assessing question complexity: {str(e)}") 174 | return 0.5 175 | 176 | def _check_information_consistency(self, facts: List[str]) -> bool: 177 | return True 178 | 179 | def should_continue_research(self, topic: str, current_source: Dict) -> Dict: 180 | if topic not in self.research_memory: 181 | return {"continue": True, "reason": "No research started yet"} 182 | 183 | findings = self.research_memory[topic] 184 | sources_count = len(findings['sources']) 185 | complexity = self.assess_question_complexity(topic) 186 | 187 | min_sources = max(2, int(complexity * 5)) 188 | quality_threshold = 0.7 + (complexity * 0.2) 189 | high_quality_sources = sum(1 for s in findings['sources'] 190 | if s.get('relevance', 0) > quality_threshold 191 | and s.get('confidence', 0) > quality_threshold) 192 | 193 | if high_quality_sources >= min_sources: 194 | return {"continue": False, "reason": "Sufficient high-quality sources found"} 195 | 196 | max_sources = min_sources * 2 197 | if sources_count >= max_sources: 198 | return {"continue": False, "reason": "Maximum sources reached"} 199 | 200 | if sources_count > 1: 201 | info_consistent = self._check_information_consistency(findings['main_facts']) 202 | if not info_consistent: 203 | return {"continue": True, "reason": "Found inconsistent information", "priority": "verification"} 204 | 205 | if sources_count > 0: 206 | latest_source = findings['sources'][-1] 207 | if latest_source.get('needs_verification', True): 208 | return {"continue": True, "reason": "Need verification", "priority": "verification"} 209 | if latest_source.get('needs_context', True) and complexity > 0.5: 210 | return {"continue": True, "reason": "Need context", "priority": "context"} 211 | 212 | return {"continue": True, "reason": "Need more information"} 213 | 214 | def brave_search_run(self, query: str, retries: int = 3) -> str: 215 | if not BRAVE_API_KEY: 216 | logger.error("Brave Search API key not set. Unable to perform search.") 217 | return "" 218 | for i in range(retries): 219 | try: 220 | return self.brave_search.run(query) 221 | except requests.HTTPError as e: 222 | if e.response.status_code == 429: 223 | logger.warning("Hit rate limit. 
Waiting before retry...") 224 | time.sleep((i+1)*2) 225 | else: 226 | logger.error(f"HTTP Error during Brave search: {str(e)}") 227 | time.sleep(2) 228 | except Exception as ex: 229 | logger.error(f"Error in Brave search: {str(ex)}") 230 | time.sleep(2) 231 | return "" 232 | 233 | def fetch_additional_info(self, topic: str) -> str: 234 | self.current_topic = topic 235 | query_type = self.memory.categorize_query(topic) 236 | 237 | if topic not in self.research_memory: 238 | self.research_memory[topic] = { 239 | 'sources': [], 240 | 'main_facts': [], 241 | 'last_update': time.time(), 242 | 'visited_urls': set() 243 | } 244 | 245 | all_research = [] 246 | research_status = {"continue": True, "reason": "Initial research"} 247 | 248 | if query_type == 'stock_price': 249 | priority_domains = [ 250 | 'marketwatch.com', 251 | 'finance.yahoo.com', 252 | 'bloomberg.com', 253 | 'reuters.com' 254 | ] 255 | 256 | for domain in priority_domains: 257 | if any(domain in s.get('url', '') for s in self.research_memory[topic]['sources']): 258 | continue 259 | 260 | search_query = f"site:{domain} {topic}" 261 | search_results = self.brave_search_run(search_query) 262 | urls = extract_urls_from_search_results(search_results) 263 | 264 | if urls: 265 | url = urls[0] 266 | if url not in self.research_memory[topic]['visited_urls']: 267 | self.research_memory[topic]['visited_urls'].add(url) 268 | content = fetch_webpage_content(url, self.provider, topic) 269 | 270 | assessment = self.assess_content_relevance(content, topic) 271 | if assessment['relevance'] > 0.7: 272 | info = self.extract_key_information(content, topic) 273 | current_source = {**assessment, **info} 274 | 275 | self.research_memory[topic]['sources'].append({ 276 | 'url': url, 277 | 'content': content, 278 | **current_source 279 | }) 280 | self.research_memory[topic]['main_facts'].extend(info['main_facts']) 281 | 282 | if assessment['relevance'] > 0.8 and assessment['confidence'] > 0.8: 283 | research_status = {"continue": False, "reason": "Found reliable stock price"} 284 | break 285 | 286 | search_attempts = 0 287 | max_search_attempts = 3 288 | 289 | while research_status["continue"] and search_attempts < max_search_attempts: 290 | try: 291 | if search_attempts == 0: 292 | search_query = topic 293 | elif search_attempts == 1: 294 | search_query = f"{topic} latest information" 295 | else: 296 | search_query = f"{topic} current data {datetime.now().strftime('%Y')}" 297 | 298 | if research_status.get("priority") == "verification": 299 | search_query += " facts verify source" 300 | elif research_status.get("priority") == "context": 301 | search_query += " background context" 302 | 303 | logger.info(f"Searching with query: {search_query}") 304 | results = self.brave_search_run(search_query) 305 | urls = extract_urls_from_search_results(results) 306 | 307 | urls = [url for url in urls if url not in self.research_memory[topic]['visited_urls']] 308 | 309 | if not urls: 310 | search_attempts += 1 311 | continue 312 | 313 | urls = self.memory.prioritize_urls(urls, topic) 314 | 315 | for url in urls[:2]: 316 | if url in self.research_memory[topic]['visited_urls']: 317 | continue 318 | 319 | self.research_memory[topic]['visited_urls'].add(url) 320 | start_time = time.time() 321 | content = fetch_webpage_content(url, self.provider, topic) 322 | response_time = time.time() - start_time 323 | 324 | assessment = self.assess_content_relevance(content, topic) 325 | domain = urlparse(url).netloc 326 | 327 | success = assessment['relevance'] > 0.5 328 | 
self.memory.update_source_reliability( 329 | domain=domain, 330 | query_type=query_type, 331 | success=success, 332 | response_time=response_time, 333 | content_quality=assessment['relevance'] 334 | ) 335 | 336 | if success: 337 | info = self.extract_key_information(content, topic) 338 | current_source = {**assessment, **info} 339 | 340 | self.research_memory[topic]['sources'].append({ 341 | 'url': url, 342 | 'content': content, 343 | **current_source 344 | }) 345 | self.research_memory[topic]['main_facts'].extend(info['main_facts']) 346 | 347 | research_status = self.should_continue_research(topic, current_source) 348 | logger.info(f"Research status: {research_status['reason']}") 349 | 350 | if not research_status["continue"]: 351 | break 352 | 353 | if not research_status["continue"]: 354 | break 355 | 356 | search_attempts += 1 357 | 358 | except Exception as e: 359 | logger.error(f"Error in research iteration: {str(e)}") 360 | search_attempts += 1 361 | 362 | all_research.append(f""" 363 | === Research Summary === 364 | Query Type: {query_type} 365 | Total Sources: {len(self.research_memory[topic]['sources'])} 366 | Key Facts Found: {json.dumps(self.research_memory[topic]['main_facts'], indent=2)} 367 | Sources: {json.dumps([{ 368 | 'url': s['url'], 369 | 'relevance': s.get('relevance', 0), 370 | 'confidence': s.get('confidence', 0), 371 | 'found_data': s.get('found_data', '') 372 | } for s in self.research_memory[topic]['sources']], indent=2)} 373 | """) 374 | 375 | return "\n\n".join(all_research) 376 | 377 | def generate_report(self, topic: str) -> str: 378 | additional_info = self.fetch_additional_info(topic) 379 | 380 | enhanced_prompt = f""" 381 | Generate a report based on the research findings. 382 | Focus on the most relevant and current information. 383 | 384 | Topic: {topic} 385 | Research Findings: {additional_info} 386 | 387 | Guidelines: 388 | 1. Prioritize information from high-quality sources 389 | 2. Include specific, factual information 390 | 3. Note any significant gaps or uncertainties 391 | 4. Cite sources where appropriate 392 | 393 | Report:""" 394 | 395 | for attempt in range(self.max_retries): 396 | try: 397 | report = invoke_model(self.llm, enhanced_prompt) 398 | return report.content 399 | except Exception as e: 400 | if attempt == self.max_retries - 1: 401 | return f"Error generating report: {str(e)}" 402 | time.sleep(self.retry_delay) 403 | 404 | def assess_research_accuracy(self, topic: str, research_data: Dict) -> Dict: 405 | assessment_prompt = f"""Analyze the research results for accuracy and completeness. 406 | Consider: 407 | 1. Consistency across sources 408 | 2. Data freshness and relevance 409 | 3. Source reliability 410 | 4. 
Information completeness 411 | 412 | Topic: {topic} 413 | Research Data: {json.dumps(research_data, indent=2)} 414 | 415 | Respond with JSON: 416 | {{ 417 | "is_accurate": boolean, 418 | "confidence": float, 419 | "completeness": float, 420 | "concerns": [string], 421 | "verification_needed": boolean 422 | }}""" 423 | 424 | try: 425 | response = invoke_model(self.llm, assessment_prompt) 426 | json_match = re.search(r'\{[\s\S]*?\}', response.content) 427 | if json_match: 428 | assessment = safe_json_loads(json_match.group(0), { 429 | "is_accurate": False, 430 | "confidence": 0.0, 431 | "completeness": 0.0, 432 | "concerns": ["Assessment failed"], 433 | "verification_needed": True 434 | }) 435 | else: 436 | assessment = { 437 | "is_accurate": False, 438 | "confidence": 0.0, 439 | "completeness": 0.0, 440 | "concerns": ["No JSON returned"], 441 | "verification_needed": True 442 | } 443 | self.current_assessment = assessment 444 | return assessment 445 | except Exception as e: 446 | logger.error(f"Error in research assessment: {str(e)}") 447 | return { 448 | "is_accurate": False, 449 | "confidence": 0.0, 450 | "completeness": 0.0, 451 | "concerns": ["Assessment failed"], 452 | "verification_needed": True 453 | } 454 | 455 | def record_human_feedback(self, topic: str, is_accurate: bool, notes: str = None): 456 | if not self.current_assessment: 457 | logger.error("No current research assessment available") 458 | return 459 | 460 | sources = [s['url'] for s in self.research_memory.get(topic, {}).get('sources', [])] 461 | self.memory.record_feedback( 462 | topic=topic, 463 | sources=sources, 464 | agent_assessment=self.current_assessment, 465 | human_feedback=is_accurate, 466 | notes=notes 467 | ) 468 | 469 | self.current_assessment = None -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Haseebasif7/SurfAgent/e527e1bce6234e3b4456298811550bf3a7552375/config/__init__.py -------------------------------------------------------------------------------- /config/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig(level=logging.INFO) 4 | logger = logging.getLogger(__name__) -------------------------------------------------------------------------------- /config/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | load_dotenv() 5 | 6 | GROQ_API_KEY = os.getenv("GROQ_API_KEY") 7 | BRAVE_API_KEY = os.getenv("BRAVE_API_KEY") -------------------------------------------------------------------------------- /config/user_settings.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import pytz 3 | import locale 4 | import geocoder 5 | from dataclasses import dataclass 6 | from config.log import logger 7 | 8 | @dataclass 9 | class UserSettings: 10 | country: str 11 | timezone: str 12 | locale: str 13 | current_time: datetime 14 | currency: str 15 | 16 | @classmethod 17 | def auto_detect(cls) -> 'UserSettings': 18 | """Automatically detect user settings dynamically.""" 19 | try: 20 | # Detect country using geolocation 21 | location = geocoder.ip('me') 22 | country = location.country if location and location.country else "Unknown" 23 | 24 | # Detect timezone using geolocation 25 | if location and 
location.latlng: 26 | country_timezones = pytz.country_timezones.get(location.country, []) 27 | local_tz = pytz.timezone(country_timezones[0]) if country_timezones else pytz.UTC 28 | else: 29 | local_tz = pytz.UTC 30 | 31 | # Get current time in detected timezone 32 | current_time = datetime.now(local_tz) 33 | 34 | # Detect locale based on country 35 | if country != "Unknown": 36 | locale_str = f'en_{country.upper()}.UTF-8' 37 | else: 38 | locale_str = 'en_US.UTF-8' 39 | locale.setlocale(locale.LC_ALL, locale_str) 40 | 41 | # Set currency based on country 42 | country_currency_map = { 43 | 'US': 'USD', 44 | 'CA': 'CAD', 45 | 'GB': 'GBP', 46 | 'AU': 'AUD', 47 | 'IN': 'INR', 48 | # Add more mappings as needed 49 | } 50 | currency = country_currency_map.get(country, 'USD') # Default to USD 51 | 52 | logger.info(f"Detected user settings: Country={country}, Timezone={local_tz}, Current time={current_time}") 53 | 54 | return cls( 55 | country=country, 56 | timezone=str(local_tz), 57 | locale=locale_str, 58 | current_time=current_time, 59 | currency=currency 60 | ) 61 | except Exception as e: 62 | logger.error(f"Error detecting user settings: {str(e)}") 63 | return cls( 64 | country="Unknown", 65 | timezone="UTC", 66 | locale="en_US.UTF-8", 67 | current_time=datetime.now(pytz.UTC), 68 | currency="USD" 69 | ) -------------------------------------------------------------------------------- /configure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Haseebasif7/SurfAgent/e527e1bce6234e3b4456298811550bf3a7552375/configure/__init__.py -------------------------------------------------------------------------------- /configure/config_llm.py: -------------------------------------------------------------------------------- 1 | from langchain_groq import ChatGroq 2 | from langchain_ollama import ChatOllama 3 | from Model.provider import ModelProvider 4 | from typing import Union 5 | from config.settings import GROQ_API_KEY 6 | 7 | def configure_llm(provider: str) -> Union[ChatOllama, ChatGroq]: 8 | """Configure LLM based on selected provider.""" 9 | if provider == ModelProvider.OLLAMA: 10 | return ChatOllama( 11 | model="llama3.2:3b-instruct-q8_0", 12 | base_url="http://localhost:11434", 13 | temperature=0.5, 14 | num_gpu=1, 15 | num_thread=8 16 | ) 17 | else: 18 | return ChatGroq( 19 | model="llama-3.3-70b-specdec", 20 | temperature=0.5, 21 | groq_api_key=GROQ_API_KEY 22 | ) -------------------------------------------------------------------------------- /configure/llama.py: -------------------------------------------------------------------------------- 1 | from Model.provider import ModelProvider 2 | from langchain.prompts import PromptTemplate 3 | from configure.config_llm import configure_llm 4 | from config.log import logger 5 | import sys 6 | from test.test_model import test_model_provider 7 | 8 | def configure_llama(): 9 | """Configure model and prompt based on user's choice.""" 10 | provider = ModelProvider.get_provider_choice() 11 | if not test_model_provider(provider): 12 | logger.error(f"Failed to initialize {provider} models") 13 | sys.exit(1) 14 | llm = configure_llm(provider) 15 | prompt = PromptTemplate( 16 | template="""You are an assistant for research tasks. Use the following documents to provide a comprehensive and concise report on the topic. Ensure the report is self-contained with all necessary information. 
17 | 18 | Topic: {topic} 19 | Documents: {documents} 20 | Report: """, 21 | input_variables=["topic", "documents"], 22 | ) 23 | return llm, prompt, provider -------------------------------------------------------------------------------- /configure/vision.py: -------------------------------------------------------------------------------- 1 | from langchain_groq import ChatGroq 2 | from langchain_ollama import ChatOllama 3 | from Model.provider import ModelProvider 4 | from typing import Union 5 | from config.settings import GROQ_API_KEY 6 | 7 | def configure_vision_model(provider: str) -> Union[ChatOllama, ChatGroq]: 8 | """Configure vision model based on selected provider.""" 9 | if provider == ModelProvider.OLLAMA: 10 | return ChatOllama( 11 | model="llama3.2-vision:11b", 12 | base_url="http://localhost:11434", 13 | temperature=0.5, 14 | num_gpu=1, 15 | num_thread=8, 16 | madvise=True, 17 | f16=True 18 | ) 19 | else: 20 | return ChatGroq( 21 | model="llama-3.2-90b-vision-preview", 22 | temperature=0.5, 23 | groq_api_key=GROQ_API_KEY 24 | ) -------------------------------------------------------------------------------- /extras/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Haseebasif7/SurfAgent/e527e1bce6234e3b4456298811550bf3a7552375/extras/__init__.py -------------------------------------------------------------------------------- /extras/pawelzmarlak-2025-01-22T16_41_02.675Z.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Haseebasif7/SurfAgent/e527e1bce6234e3b4456298811550bf3a7552375/extras/pawelzmarlak-2025-01-22T16_41_02.675Z.png -------------------------------------------------------------------------------- /extras/pawelzmarlak-2025-02-13T06_51_12.773Z.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Haseebasif7/SurfAgent/e527e1bce6234e3b4456298811550bf3a7552375/extras/pawelzmarlak-2025-02-13T06_51_12.773Z.png -------------------------------------------------------------------------------- /extras/safejsonload.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | import json 3 | from config.log import logger 4 | 5 | def safe_json_loads(json_str: str, fallback: Dict, content: str = "") -> Dict: 6 | try: 7 | return json.loads(json_str) 8 | except json.JSONDecodeError: 9 | fixed = json_str.replace('""', '"') 10 | try: 11 | return json.loads(fixed) 12 | except json.JSONDecodeError: 13 | logger.error("Still can't parse JSON after fix.") 14 | return fallback -------------------------------------------------------------------------------- /extras/viewimage.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | import json 3 | from config.log import logger 4 | 5 | def safe_json_loads(json_str: str, fallback: Dict, content: str = "") -> Dict: 6 | try: 7 | return json.loads(json_str) 8 | except json.JSONDecodeError: 9 | fixed = json_str.replace('""', '"') 10 | try: 11 | return json.loads(fixed) 12 | except json.JSONDecodeError: 13 | logger.error("Still can't parse JSON after fix.") 14 | return fallback -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from config.log import logger 2 | from test.selenium import test_selenium 
3 | import sys 4 | from Model.provider import ModelProvider 5 | from test.test_model import test_model_provider 6 | from configure.llama import configure_llama 7 | from config.settings import BRAVE_API_KEY 8 | from langchain_community.tools import BraveSearch, WikipediaQueryRun 9 | from langchain_community.utilities.wikipedia import WikipediaAPIWrapper 10 | from agent.web_agent import WebAgent 11 | import time 12 | import warnings 13 | 14 | warnings.filterwarnings("ignore") 15 | 16 | def print_banner(): 17 | banner = """ 18 | \033[92m 19 | ____ _ _ ____ ____ __ ___ ____ __ _ ____ 20 | / ___)/ )( \( _ \( __) / _\ / __)( __)( ( \(_ _) 21 | \___ \) \/ ( ) / ) _) / \( (_ \ ) _) / / )( 22 | (____/\____/(__\_)(__) \_/\_/ \___/(____)\_)__) (__) 23 | \033[0m 24 | """ 25 | print(banner) 26 | print("\033[92mSurf: Your Intelligent Web Companion!\033[0m\n") 27 | 28 | def print_separator(): 29 | print("=" * 80) 30 | 31 | def main(): 32 | print_banner() 33 | print("Initializing SurfAgent...") 34 | time.sleep(1) 35 | 36 | if not test_selenium(): 37 | logger.error("Selenium service check failed") 38 | print("Error: Selenium service is not working. Exiting.") 39 | sys.exit(1) 40 | 41 | provider = input("Choose model provider (ollama/groq): ").lower().strip() 42 | if provider not in [ModelProvider.OLLAMA, ModelProvider.GROQ]: 43 | logger.error("Invalid provider choice") 44 | print("Error: Invalid provider. Exiting.") 45 | sys.exit(1) 46 | 47 | if not test_model_provider(provider): 48 | logger.error(f"Model provider {provider} check failed") 49 | print(f"Error: Model provider {provider} is not functioning correctly. Exiting.") 50 | sys.exit(1) 51 | 52 | llm, prompt, provider = configure_llama() 53 | 54 | if not BRAVE_API_KEY: 55 | logger.warning("Brave Search API key not set. Searches will not return results.") 56 | print("Warning: Brave Search API key not found. Limited functionality.") 57 | 58 | brave_search = BraveSearch.from_api_key( 59 | api_key=BRAVE_API_KEY, 60 | search_kwargs={"count": 6} 61 | ) 62 | wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()) 63 | retriever = None 64 | agent = WebAgent(retriever, llm, prompt, brave_search, wikipedia, provider) 65 | 66 | print("SurfAgent is ready to assist you! 🚀\n") 67 | print_separator() 68 | 69 | while True: 70 | try: 71 | topic = input("\n🌐 Enter a topic for web search (or type 'quit' to exit): ").strip() 72 | if topic.lower() == 'quit': 73 | print("\n👋 Thank you for using SurfAgent. Goodbye!") 74 | break 75 | 76 | if not topic: 77 | print("⚠️ Please enter a valid topic.") 78 | continue 79 | logger.info(f"Starting research for topic: {topic}") 80 | print(f"\n🔍 Researching: {topic}...") 81 | report = agent.generate_report(topic) 82 | 83 | print("\n📜 Response:") 84 | print_separator() 85 | print(report) 86 | print_separator() 87 | 88 | feedback = input("\n✅ Was this information accurate? (y/n): ").lower().strip() 89 | if feedback in ['y', 'n']: 90 | is_accurate = feedback == 'y' 91 | notes = input("📝 Any additional notes? (Enter to skip): ").strip() 92 | agent.record_human_feedback(topic, is_accurate, notes if notes else None) 93 | print("🙏 Thank you for your feedback!") 94 | 95 | except KeyboardInterrupt: 96 | print("\n🛑 Research interrupted by user.") 97 | break 98 | except Exception as e: 99 | logger.error(f"Error during research: {str(e)}") 100 | print("⚠️ An error occurred. 
Please try again.") 101 | continue 102 | 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Haseebasif7/SurfAgent/e527e1bce6234e3b4456298811550bf3a7552375/memory/__init__.py -------------------------------------------------------------------------------- /memory/research_mem.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import re 4 | from datetime import datetime, timezone 5 | from urllib.parse import urlparse 6 | from source_reliable.source_reliability_class import SourceReliability 7 | from config.log import logger 8 | from typing import Dict, List 9 | 10 | 11 | class ResearchMemory: 12 | def __init__(self, memory_file="agent_memory.json"): 13 | self.memory_file = memory_file 14 | self.source_reliability = {} 15 | self.query_patterns = {} 16 | self.feedback_history = {} 17 | self.load_memory() 18 | 19 | def load_memory(self): 20 | try: 21 | if os.path.exists(self.memory_file): 22 | with open(self.memory_file, 'r') as f: 23 | data = json.load(f) 24 | 25 | for domain, info in data.get('sources', {}).items(): 26 | self.source_reliability[domain] = SourceReliability( 27 | domain=domain, 28 | query_types=info.get('query_types', {}), 29 | last_success=datetime.fromisoformat(info['last_success']) if info.get('last_success') else None, 30 | last_failure=datetime.fromisoformat(info['last_failure']) if info.get('last_failure') else None, 31 | total_attempts=info.get('total_attempts', 0), 32 | successful_attempts=info.get('successful_attempts', 0), 33 | average_response_time=info.get('average_response_time', 0.0), 34 | notes=info.get('notes', []) 35 | ) 36 | 37 | self.query_patterns = data.get('query_patterns', {}) 38 | self.feedback_history = data.get('feedback_history', {}) 39 | 40 | logger.info(f"Loaded research memory with {len(self.source_reliability)} sources and {len(self.feedback_history)} feedback entries") 41 | except Exception as e: 42 | logger.error(f"Error loading research memory: {str(e)}") 43 | self.source_reliability = {} 44 | self.query_patterns = {} 45 | self.feedback_history = {} 46 | 47 | def save_memory(self): 48 | try: 49 | data = { 50 | 'sources': { 51 | domain: { 52 | 'query_types': info.query_types, 53 | 'last_success': info.last_success.isoformat() if info.last_success else None, 54 | 'last_failure': info.last_failure.isoformat() if info.last_failure else None, 55 | 'total_attempts': info.total_attempts, 56 | 'successful_attempts': info.successful_attempts, 57 | 'average_response_time': info.average_response_time, 58 | 'notes': info.notes 59 | } 60 | for domain, info in self.source_reliability.items() 61 | }, 62 | 'query_patterns': self.query_patterns, 63 | 'feedback_history': self.feedback_history 64 | } 65 | 66 | with open(self.memory_file, 'w') as f: 67 | json.dump(data, f, indent=2) 68 | 69 | logger.info("Successfully saved research memory") 70 | except Exception as e: 71 | logger.error(f"Error saving research memory: {str(e)}") 72 | 73 | def categorize_query(self, query: str) -> str: 74 | categories = { 75 | 'stock_price': r'(?i)(stock|share)\s+price|price\s+of\s+stock', 76 | 'financial_data': r'(?i)financial|revenue|earnings|profit|market\s+cap', 77 | 'company_info': r'(?i)headquarters|ceo|founded|employees|about', 78 | 'news': r'(?i)news|latest|recent|update|announce', 79 | 
'technical': r'(?i)technology|software|product|service|api', 80 | 'general': r'.*' 81 | } 82 | 83 | for category, pattern in categories.items(): 84 | if re.search(pattern, query): 85 | return category 86 | return 'general' 87 | 88 | def update_source_reliability(self, domain: str, query_type: str, success: bool, response_time: float, content_quality: float): 89 | if domain not in self.source_reliability: 90 | self.source_reliability[domain] = SourceReliability( 91 | domain=domain, 92 | query_types={}, 93 | last_success=None, 94 | last_failure=None, 95 | total_attempts=0, 96 | successful_attempts=0, 97 | average_response_time=0.0, 98 | notes=[] 99 | ) 100 | 101 | source = self.source_reliability[domain] 102 | current_time = datetime.now(timezone.utc) 103 | 104 | if query_type not in source.query_types: 105 | source.query_types[query_type] = 0.0 106 | 107 | source.total_attempts += 1 108 | if success: 109 | source.successful_attempts += 1 110 | source.last_success = current_time 111 | source.query_types[query_type] = ( 112 | source.query_types[query_type] * 0.9 + 113 | content_quality * 0.1 114 | ) 115 | else: 116 | source.last_failure = current_time 117 | source.query_types[query_type] *= 0.9 118 | 119 | source.average_response_time = ( 120 | source.average_response_time * 0.9 + 121 | response_time * 0.1 122 | ) 123 | 124 | self.save_memory() 125 | 126 | def get_best_sources(self, query_type: str, min_reliability: float = 0.3) -> List[str]: 127 | relevant_sources = [] 128 | 129 | for domain, source in self.source_reliability.items(): 130 | reliability = source.query_types.get(query_type, 0.0) 131 | if reliability >= min_reliability: 132 | relevant_sources.append((domain, reliability)) 133 | 134 | relevant_sources.sort(key=lambda x: x[1], reverse=True) 135 | return [domain for domain, _ in relevant_sources] 136 | 137 | def prioritize_urls(self, urls: List[str], query: str) -> List[str]: 138 | query_type = self.categorize_query(query) 139 | self.get_best_sources(query_type) 140 | 141 | scored_urls = [] 142 | for url in urls: 143 | domain = urlparse(url).netloc 144 | source = self.source_reliability.get(domain) 145 | 146 | if source: 147 | reliability = source.query_types.get(query_type, 0.0) 148 | success_rate = source.successful_attempts / max(1, source.total_attempts) 149 | response_speed = 1.0 / (1.0 + source.average_response_time) 150 | score = (reliability * 0.5 + 151 | success_rate * 0.3 + 152 | response_speed * 0.2) 153 | else: 154 | score = 0.1 155 | 156 | scored_urls.append((url, score)) 157 | 158 | scored_urls.sort(key=lambda x: x[1], reverse=True) 159 | return [url for url, _ in scored_urls] 160 | 161 | def record_feedback(self, topic: str, sources: List[str], agent_assessment: Dict, human_feedback: bool, notes: str = None): 162 | current_time = datetime.now(timezone.utc) 163 | query_type = self.categorize_query(topic) 164 | 165 | feedback_entry = { 166 | 'timestamp': current_time.isoformat(), 167 | 'topic': topic, 168 | 'sources': sources, 169 | 'agent_assessment': agent_assessment, 170 | 'human_feedback': human_feedback, 171 | 'query_type': query_type, 172 | 'notes': notes 173 | } 174 | 175 | if topic not in self.feedback_history: 176 | self.feedback_history[topic] = [] 177 | self.feedback_history[topic].append(feedback_entry) 178 | 179 | agent_confidence = agent_assessment.get('confidence', 0.0) 180 | agent_correct = agent_assessment.get('is_accurate', False) 181 | 182 | for source in sources: 183 | domain = urlparse(source).netloc 184 | if domain not in self.source_reliability: 
185 | continue 186 | 187 | source_info = self.source_reliability[domain] 188 | 189 | if human_feedback: 190 | if agent_correct == human_feedback: 191 | self._update_source_confidence(domain, query_type, True, 1.0) 192 | source_info.notes.append(f"[{current_time.isoformat()}] Accurate assessment confirmed by human feedback") 193 | else: 194 | self._update_source_confidence(domain, query_type, False, 1.0) 195 | source_info.notes.append(f"[{current_time.isoformat()}] Assessment contradicted by human feedback") 196 | else: 197 | self._update_source_confidence(domain, query_type, agent_correct, agent_confidence) 198 | 199 | self.save_memory() 200 | 201 | def _update_source_confidence(self, domain: str, query_type: str, success: bool, confidence: float): 202 | source = self.source_reliability[domain] 203 | 204 | if query_type not in source.query_types: 205 | source.query_types[query_type] = 0.0 206 | 207 | current_reliability = source.query_types[query_type] 208 | 209 | if success: 210 | new_reliability = current_reliability + (1 - current_reliability) * confidence * 0.1 211 | else: 212 | new_reliability = current_reliability * 0.8 213 | 214 | source.query_types[query_type] = max(0.0, min(1.0, new_reliability)) 215 | 216 | def get_feedback_stats(self, domain: str = None, query_type: str = None) -> Dict: 217 | stats = { 218 | 'total_entries': 0, 219 | 'agent_accuracy': 0.0, 220 | 'human_agreement': 0.0, 221 | 'query_type_performance': {}, 222 | 'recent_trends': [] 223 | } 224 | 225 | relevant_entries = [] 226 | 227 | for topic_entries in self.feedback_history.values(): 228 | for entry in topic_entries: 229 | if domain and not any(domain in s for s in entry['sources']): 230 | continue 231 | if query_type and entry['query_type'] != query_type: 232 | continue 233 | relevant_entries.append(entry) 234 | 235 | if not relevant_entries: 236 | return stats 237 | 238 | stats['total_entries'] = len(relevant_entries) 239 | 240 | correct_assessments = sum(1 for e in relevant_entries 241 | if e['agent_assessment'].get('is_accurate') == e['human_feedback']) 242 | human_agreements = sum(1 for e in relevant_entries if e['human_feedback']) 243 | 244 | stats['agent_accuracy'] = correct_assessments / len(relevant_entries) 245 | stats['human_agreement'] = human_agreements / len(relevant_entries) 246 | 247 | query_types = {} 248 | for entry in relevant_entries: 249 | qt = entry['query_type'] 250 | if qt not in query_types: 251 | query_types[qt] = {'total': 0, 'successful': 0} 252 | query_types[qt]['total'] += 1 253 | if entry['human_feedback']: 254 | query_types[qt]['successful'] += 1 255 | 256 | stats['query_type_performance'] = { 257 | qt: {'success_rate': data['successful'] / data['total']} 258 | for qt, data in query_types.items() 259 | } 260 | 261 | recent = relevant_entries[-10:] 262 | stats['recent_trends'] = [ 263 | { 264 | 'timestamp': e['timestamp'], 265 | 'query_type': e['query_type'], 266 | 'success': e['human_feedback'] 267 | } 268 | for e in recent 269 | ] 270 | 271 | return stats -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langchain 2 | langchain_community 3 | langchain_ollama 4 | langchain_groq 5 | requests 6 | beautifulsoup4 7 | pytz 8 | selenium 9 | webdriver_manager 10 | pillow 11 | geocoder -------------------------------------------------------------------------------- /source_reliable/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Haseebasif7/SurfAgent/e527e1bce6234e3b4456298811550bf3a7552375/source_reliable/__init__.py -------------------------------------------------------------------------------- /source_reliable/source_reliability_class.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from datetime import datetime 3 | from typing import Dict, List, Optional 4 | 5 | @dataclass 6 | class SourceReliability: 7 | domain: str 8 | query_types: Dict[str, float] 9 | last_success: Optional[datetime] 10 | last_failure: Optional[datetime] 11 | total_attempts: int 12 | successful_attempts: int 13 | average_response_time: float 14 | notes: List[str] -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Haseebasif7/SurfAgent/e527e1bce6234e3b4456298811550bf3a7552375/test/__init__.py -------------------------------------------------------------------------------- /test/ollama.py: -------------------------------------------------------------------------------- 1 | from config.log import logger 2 | from langchain_ollama import ChatOllama 3 | from langchain.schema import HumanMessage, AIMessage 4 | 5 | def test_ollama() -> bool: 6 | """Test if Ollama is running and accessible.""" 7 | try: 8 | test_llm = ChatOllama( 9 | model="llama3.2:3b-instruct-q8_0", 10 | base_url="http://localhost:11434", 11 | temperature=0, 12 | num_gpu=1, 13 | num_thread=8 14 | ) 15 | resp = test_llm([HumanMessage(content="Hello")]) 16 | if isinstance(resp, AIMessage) and len(resp.content) > 0: 17 | logger.info("✅ Ollama is accessible") 18 | return True 19 | else: 20 | logger.error("❌ Ollama did not return a valid response") 21 | return False 22 | except Exception as e: 23 | logger.error(f"❌ Ollama test failed: {str(e)}") 24 | return False -------------------------------------------------------------------------------- /test/selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.chrome.service import Service 3 | from selenium.webdriver.chrome.options import Options 4 | from webdriver_manager.chrome import ChromeDriverManager 5 | import time 6 | import io 7 | from PIL import Image 8 | from tools.size_limit import ensure_size_within_limits 9 | from config.log import logger 10 | 11 | def test_selenium() -> bool: 12 | """Test if Selenium can run and capture a screenshot of a test page using Chrome.""" 13 | try: 14 | chrome_options = Options() 15 | chrome_options.add_argument('--headless') 16 | chrome_options.add_argument('--window-size=1920,1080') 17 | chrome_options.add_argument('--disable-gpu') # To avoid potential issues with headless mode 18 | 19 | driver = webdriver.Chrome( 20 | service=Service(ChromeDriverManager().install()), 21 | options=chrome_options 22 | ) 23 | 24 | driver.set_page_load_timeout(20) 25 | driver.get("https://example.com") 26 | 27 | # Set zoom level for better text legibility 28 | driver.execute_script("document.body.style.zoom = '200%'") # Increased from 150% 29 | 30 | # Ensure text is readable 31 | driver.execute_script(""" 32 | document.querySelectorAll('*').forEach(function(el) { 33 | let style = window.getComputedStyle(el); 34 | if (parseInt(style.fontSize) < 16) { // Increased minimum 
font size 35 | el.style.fontSize = '16px'; 36 | } 37 | // Improve contrast 38 | if (style.color && style.backgroundColor) { 39 | let textColor = style.color; 40 | let bgColor = style.backgroundColor; 41 | if (textColor === bgColor || textColor === 'rgba(0, 0, 0, 0)') { 42 | el.style.color = '#000000'; 43 | } 44 | } 45 | }); 46 | """) 47 | 48 | # Additional wait for text scaling 49 | time.sleep(1) 50 | 51 | # Get page dimensions with padding for better quality 52 | total_height = driver.execute_script("return Math.max(document.documentElement.scrollHeight, document.body.scrollHeight);") 53 | total_width = driver.execute_script("return Math.max(document.documentElement.scrollWidth, document.body.scrollWidth);") 54 | 55 | # Add padding and ensure minimum dimensions 56 | total_width = max(total_width, 1920) 57 | total_height = int(total_height * 1.1) 58 | 59 | # Ensure dimensions are within pixel limit 60 | final_width, final_height = ensure_size_within_limits(total_width, total_height) 61 | 62 | # Set window size with the adjusted dimensions 63 | driver.set_window_size(final_width, final_height) 64 | 65 | # Wait for any dynamic content to load 66 | time.sleep(1) 67 | 68 | # Capture full screenshot in memory with high quality 69 | screenshot_png = driver.get_screenshot_as_png() 70 | driver.quit() 71 | 72 | # Decode and verify image 73 | img = Image.open(io.BytesIO(screenshot_png)) 74 | img.verify() 75 | logger.info(f"✅ Selenium is running with Chrome and captured screenshot ({img.size[0]}x{img.size[1]} px)") 76 | return True 77 | except Exception as e: 78 | logger.error(f"❌ Selenium test failed: {str(e)}") 79 | return False -------------------------------------------------------------------------------- /test/test_model.py: -------------------------------------------------------------------------------- 1 | from config.log import logger 2 | from Model.provider import ModelProvider 3 | from config.settings import GROQ_API_KEY 4 | import requests 5 | from test.ollama import test_ollama 6 | 7 | def test_model_provider(provider: str) -> bool: 8 | """Test if the selected model provider is accessible.""" 9 | try: 10 | if provider == ModelProvider.OLLAMA: 11 | return test_ollama() 12 | else: 13 | if not GROQ_API_KEY: 14 | logger.error("GROQ_API_KEY not set.") 15 | return False 16 | headers = {"Authorization": f"Bearer {GROQ_API_KEY}"} 17 | response = requests.get("https://api.groq.com/openai/v1/models", headers=headers) 18 | response.raise_for_status() 19 | logger.info("✅ Groq API is accessible") 20 | return True 21 | except Exception as e: 22 | logger.error(f"❌ {provider.capitalize()} test failed: {str(e)}") 23 | return False -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Haseebasif7/SurfAgent/e527e1bce6234e3b4456298811550bf3a7552375/tools/__init__.py -------------------------------------------------------------------------------- /tools/capture_ss.py: -------------------------------------------------------------------------------- 1 | from config.log import logger 2 | from PIL import Image 3 | import io 4 | import math 5 | import time 6 | from tools.size_limit import ensure_size_within_limits 7 | 8 | def capture_full_page_screenshot(driver, url: str) -> bytes: 9 | """Capture a full page screenshot by scrolling and stitching.""" 10 | try: 11 | # Get initial dimensions 12 | total_height = driver.execute_script("return 
Math.max(document.documentElement.scrollHeight, document.body.scrollHeight);") 13 | total_width = driver.execute_script("return Math.max(document.documentElement.scrollWidth, document.body.scrollWidth);") 14 | 15 | # Calculate viewport height 16 | viewport_height = driver.execute_script("return window.innerHeight;") 17 | 18 | # Pre-calculate final dimensions to ensure they're within limits 19 | MAX_PIXELS = 33177600 * 0.9 # 10% safety margin 20 | 21 | # If the page is very long, we'll split it into sections 22 | if total_height > 15000 or (total_width * total_height) > MAX_PIXELS: 23 | # Calculate maximum height that would fit within pixel limit 24 | max_safe_height = int(MAX_PIXELS / total_width) 25 | 26 | # Adjust section size based on max safe height 27 | section_height = min(viewport_height, max_safe_height // 4) # Use quarter of max safe height per section 28 | 29 | sections = [] 30 | offset = 0 31 | while offset < total_height: 32 | # Scroll to position 33 | driver.execute_script(f"window.scrollTo(0, {offset});") 34 | time.sleep(0.5) # Wait for scroll and content to load 35 | 36 | # Capture viewport 37 | section_png = driver.get_screenshot_as_png() 38 | section = Image.open(io.BytesIO(section_png)) 39 | 40 | # Ensure section is within limits 41 | if section.height > section_height: 42 | section = section.crop((0, 0, section.width, section_height)) 43 | 44 | sections.append(section) 45 | offset += section_height 46 | 47 | # Calculate final dimensions ensuring they're within limits 48 | final_width = min(total_width, 1920) # Cap width at 1920px 49 | final_height = min(total_height, int(MAX_PIXELS / final_width)) 50 | 51 | # Create new image with calculated dimensions 52 | final_image = Image.new('RGB', (final_width, final_height)) 53 | y_offset = 0 54 | 55 | for section in sections: 56 | if y_offset + section.height > final_height: 57 | # Crop section if it would exceed final height 58 | remaining_height = final_height - y_offset 59 | if remaining_height <= 0: 60 | break 61 | section = section.crop((0, 0, section.width, remaining_height)) 62 | 63 | final_image.paste(section, (0, y_offset)) 64 | y_offset += section.height 65 | if y_offset >= final_height: 66 | break 67 | 68 | # Verify final size 69 | if final_image.width * final_image.height > MAX_PIXELS: 70 | # Resize if somehow still too large 71 | scale = math.sqrt(MAX_PIXELS / (final_image.width * final_image.height)) 72 | new_width = int(final_image.width * scale) 73 | new_height = int(final_image.height * scale) 74 | final_image = final_image.resize((new_width, new_height), Image.Resampling.LANCZOS) 75 | 76 | # Convert to PNG 77 | output = io.BytesIO() 78 | final_image.save(output, format='PNG', optimize=True) 79 | return output.getvalue() 80 | else: 81 | # For shorter pages, still ensure we're within limits 82 | final_width, final_height = ensure_size_within_limits(total_width, total_height) 83 | driver.set_window_size(final_width, final_height) 84 | time.sleep(0.5) 85 | return driver.get_screenshot_as_png() 86 | 87 | except Exception as e: 88 | logger.error(f"Error in full page capture: {str(e)}") 89 | # Fallback to a safe capture 90 | safe_width, safe_height = ensure_size_within_limits(1920, 1080) 91 | driver.set_window_size(safe_width, safe_height) 92 | return driver.get_screenshot_as_png() -------------------------------------------------------------------------------- /tools/create_vecstore.py: -------------------------------------------------------------------------------- 1 | from langchain_community.vectorstores import 
SKLearnVectorStore 2 | from langchain_ollama import OllamaEmbeddings 3 | 4 | def create_vectorstore(docs_splits): 5 | embeddings = OllamaEmbeddings( 6 | model="all-minilm", 7 | base_url="http://localhost:11434/v1" 8 | ) 9 | vectorstore = SKLearnVectorStore.from_documents( 10 | documents=docs_splits, 11 | embedding=embeddings 12 | ) 13 | return vectorstore.as_retriever(k=4) -------------------------------------------------------------------------------- /tools/extract_urls.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import urlparse 3 | from typing import List 4 | from config.log import logger 5 | from tools.host_tracker import host_tracker 6 | 7 | def extract_urls_from_search_results(search_text: str) -> List[str]: 8 | urls = re.findall(r'(https?://[^\s\'"]+)', search_text) 9 | valid_urls = [] 10 | for url in urls: 11 | url = re.sub(r'[.,)\]]+$', '', url) 12 | if url.startswith(('http://', 'https://')): 13 | if not host_tracker.is_problematic_host(url): 14 | valid_urls.append(url) 15 | else: 16 | logger.info(f"Filtered out problematic host: {urlparse(url).netloc}") 17 | return list(set(valid_urls)) -------------------------------------------------------------------------------- /tools/fetch_webpage.py: -------------------------------------------------------------------------------- 1 | from config.log import logger 2 | from tools.host_tracker import host_tracker 3 | from PIL import Image 4 | from selenium import webdriver 5 | from selenium.webdriver.chrome.service import Service 6 | from selenium.webdriver.chrome.options import Options 7 | from webdriver_manager.chrome import ChromeDriverManager 8 | from urllib.parse import urlparse 9 | import time 10 | import io 11 | import base64 12 | from tools.size_limit import ensure_size_within_limits 13 | from tools.capture_ss import capture_full_page_screenshot 14 | from configure.vision import configure_vision_model 15 | from configure.config_llm import configure_llm 16 | from tools.vision_query import generate_vision_query 17 | 18 | 19 | def fetch_webpage_content(url: str, provider: str, original_query: str) -> str: 20 | """Fetch webpage content by capturing a screenshot via Selenium and processing it with a vision model.""" 21 | if host_tracker .is_problematic_host(url): 22 | logger.info(f"Skipping known problematic host: {urlparse(url).netloc}") 23 | return f"Skipped: Known problematic host" 24 | 25 | try: 26 | # Set up Selenium (headless Chrome) 27 | chrome_options = Options() 28 | chrome_options.add_argument('--headless') 29 | chrome_options.add_argument('--window-size=1920,1080') 30 | chrome_options.add_argument('--disable-gpu') # To avoid potential issues with headless mode 31 | chrome_options.add_argument('--disable-blink-features=AutomationControlled') # Hide automation 32 | chrome_options.add_argument('--disable-notifications') 33 | 34 | # Add headers to appear more like a real browser 35 | chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36') 36 | 37 | driver = webdriver.Chrome( 38 | service=Service(ChromeDriverManager().install()), 39 | options=chrome_options 40 | ) 41 | driver.set_page_load_timeout(60) 42 | 43 | # Set cookies and localStorage to bypass some anti-bot measures 44 | driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") 45 | 46 | # Attempt to load the page 47 | driver.get(url) 48 | time.sleep(2) # Give time 
for dynamic content to load 49 | 50 | # Check for and handle CAPTCHA/cookie popups 51 | try: 52 | driver.execute_script(""" 53 | // Remove common overlay elements 54 | document.querySelectorAll('[class*="cookie"], [class*="popup"], [class*="modal"], [id*="cookie"], [id*="popup"], [id*="modal"]') 55 | .forEach(el => el.remove()); 56 | // Remove fixed position elements that might overlay content 57 | document.querySelectorAll('*').forEach(el => { 58 | const style = window.getComputedStyle(el); 59 | if (style.position === 'fixed' || style.position === 'sticky') { 60 | el.remove(); 61 | } 62 | }); 63 | """) 64 | except Exception as e: 65 | logger.warning(f"Error handling overlays: {str(e)}") 66 | 67 | # Set text size and ensure readability with special handling for financial data 68 | driver.execute_script(""" 69 | // Set base zoom 70 | document.body.style.zoom = '200%'; // Increased from 125% 71 | 72 | // Function to check if text might be financial data 73 | function isFinancialData(text) { 74 | return /\\$|\\d+\\.\\d+|\\d+%|price|stock|market|share/i.test(text); 75 | } 76 | 77 | // Ensure text is readable with special handling for financial data 78 | document.querySelectorAll('*').forEach(function(el) { 79 | let style = window.getComputedStyle(el); 80 | let text = el.textContent || ''; 81 | 82 | // Special handling for financial data 83 | if (isFinancialData(text)) { 84 | el.style.fontSize = '24px'; // Larger size for financial data 85 | el.style.fontWeight = 'bold'; 86 | el.style.color = '#000000'; // Ensure high contrast 87 | } else if (parseInt(style.fontSize) < 16) { // Increased minimum font size 88 | el.style.fontSize = '16px'; 89 | } 90 | 91 | // Improve contrast 92 | if (style.color && style.backgroundColor) { 93 | let textColor = style.color; 94 | let bgColor = style.backgroundColor; 95 | if (textColor === bgColor || textColor === 'rgba(0, 0, 0, 0)' || 96 | textColor === 'rgb(255, 255, 255)' || textColor === '#ffffff') { 97 | el.style.color = '#000000'; 98 | } 99 | } 100 | 101 | // Improve visibility of links 102 | if (el.tagName.toLowerCase() === 'a') { 103 | el.style.textDecoration = 'underline'; 104 | } 105 | }); 106 | 107 | // Additional handling for table cells (common in financial data) 108 | document.querySelectorAll('td, th').forEach(function(el) { 109 | let text = el.textContent || ''; 110 | if (isFinancialData(text)) { 111 | el.style.padding = '10px'; 112 | el.style.fontSize = '24px'; 113 | el.style.fontWeight = 'bold'; 114 | } 115 | }); 116 | """) 117 | 118 | # Additional wait for text adjustments 119 | time.sleep(2) # Increased wait time 120 | 121 | # Get dimensions and ensure they're within limits 122 | total_height = driver.execute_script("return Math.max(document.documentElement.scrollHeight, document.body.scrollHeight);") 123 | total_width = driver.execute_script("return Math.max(document.documentElement.scrollWidth, document.body.scrollWidth);") 124 | 125 | final_width, final_height = ensure_size_within_limits(total_width, total_height) 126 | logger.info(f"Adjusted dimensions to {final_width}x{final_height} to stay within pixel limit") 127 | 128 | # Set final window size 129 | driver.set_window_size(final_width, final_height) 130 | time.sleep(1) 131 | 132 | # Capture the screenshot using our improved method 133 | screenshot_png = capture_full_page_screenshot(driver, url) 134 | driver.quit() 135 | 136 | # Process the image 137 | img = Image.open(io.BytesIO(screenshot_png)) 138 | 139 | # Convert to RGB and enhance readability 140 | if img.mode in ('RGBA', 'P'): 141 | 
img = img.convert('RGB') 142 | 143 | # Enhance image quality with specified values 144 | from PIL import ImageEnhance 145 | enhancer = ImageEnhance.Sharpness(img) 146 | img = enhancer.enhance(1.25) # Modified sharpness value 147 | enhancer = ImageEnhance.Contrast(img) 148 | img = enhancer.enhance(1.25) # Modified contrast value 149 | 150 | # Save with high quality 151 | output = io.BytesIO() 152 | img.save(output, format='JPEG', quality=100, optimize=True) # Maximum quality 153 | screenshot_data = output.getvalue() 154 | 155 | # Convert to base64 156 | base64_image = base64.b64encode(screenshot_data).decode('utf-8') 157 | 158 | vision_llm = configure_vision_model(provider) 159 | text_llm = configure_llm(provider) 160 | 161 | vision_query = generate_vision_query(text_llm, original_query) 162 | logger.info(f"Using vision query: {vision_query}") 163 | 164 | messages = [ 165 | { 166 | "role": "user", 167 | "content": [ 168 | {"type": "text", "text": vision_query}, 169 | { 170 | "type": "image_url", 171 | "image_url": { 172 | "url": f"data:image/jpeg;base64,{base64_image}", 173 | "detail": "high" 174 | } 175 | } 176 | ] 177 | } 178 | ] 179 | 180 | logger.info(f"Processing screenshot from {url} with vision model ({provider})") 181 | vision_response = vision_llm.invoke(messages) 182 | 183 | extracted_text = vision_response.content.strip() 184 | 185 | print("\n" + "="*80) 186 | print(f"Vision Model Description for {url}:") 187 | print("-"*80) 188 | print(extracted_text) 189 | print("="*80 + "\n") 190 | 191 | logger.info(f"Successfully processed content from {url}") 192 | return extracted_text 193 | 194 | except Exception as e: 195 | host_tracker .add_failed_host(url) 196 | logger.error(f"Error processing {url}: {str(e)}") 197 | return f"Error processing {url}: {str(e)}" -------------------------------------------------------------------------------- /tools/host_tracker.py: -------------------------------------------------------------------------------- 1 | from config.log import logger 2 | from urllib.parse import urlparse 3 | import os 4 | 5 | class HostTracker: 6 | def __init__(self, filename="HOSTS.txt"): 7 | self.filename = filename 8 | self.failed_hosts = set() 9 | self.load_failed_hosts() 10 | 11 | def load_failed_hosts(self): 12 | """Load failed hosts from file.""" 13 | try: 14 | if os.path.exists(self.filename): 15 | with open(self.filename, 'r') as f: 16 | self.failed_hosts = set(line.strip() for line in f if line.strip()) 17 | logger.info(f"Loaded {len(self.failed_hosts)} problematic hosts from {self.filename}") 18 | except Exception as e: 19 | logger.error(f"Error loading failed hosts: {str(e)}") 20 | self.failed_hosts = set() 21 | 22 | def add_failed_host(self, url: str): 23 | """Add a failed host to the tracking list.""" 24 | try: 25 | host = urlparse(url).netloc 26 | if host and host not in self.failed_hosts: 27 | self.failed_hosts.add(host) 28 | with open(self.filename, 'a') as f: 29 | f.write(f"{host}\n") 30 | logger.info(f"Added {host} to problematic hosts list") 31 | except Exception as e: 32 | logger.error(f"Error adding failed host: {str(e)}") 33 | 34 | def is_problematic_host(self, url: str) -> bool: 35 | """Check if a URL's host is in the problematic list.""" 36 | try: 37 | host = urlparse(url).netloc 38 | return host in self.failed_hosts 39 | except Exception: 40 | return False 41 | 42 | host_tracker = HostTracker() -------------------------------------------------------------------------------- /tools/size_limit.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | 3 | def ensure_size_within_limits(width: int, height: int, max_pixels: int = 33177600) -> tuple: 4 | """Ensure dimensions are within the pixel limit while maintaining aspect ratio.""" 5 | total_pixels = width * height 6 | 7 | max_pixels = int(max_pixels * 0.9) 8 | 9 | if total_pixels <= max_pixels: 10 | return width, height 11 | 12 | # Calculate scaling factor to fit within limit 13 | scale = math.sqrt(max_pixels / total_pixels) 14 | new_width = int(width * scale) 15 | new_height = int(height * scale) 16 | 17 | if new_width * new_height > max_pixels: 18 | scale *= 0.95 19 | new_width = int(width * scale) 20 | new_height = int(height * scale) 21 | 22 | return new_width, new_height -------------------------------------------------------------------------------- /tools/split_doc.py: -------------------------------------------------------------------------------- 1 | from langchain.text_splitter import RecursiveCharacterTextSplitter 2 | 3 | def split_documents(docs): 4 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0) 5 | return text_splitter.split_documents(docs) -------------------------------------------------------------------------------- /tools/topic_into_sub.py: -------------------------------------------------------------------------------- 1 | from Model.invokemodel import invoke_model 2 | 3 | def decompose_topic_into_subtopics(llm, topic): 4 | decomposition_prompt = f"""You are a research assistant. 5 | You will be given a research topic. If the topic is broad or complex, break it down into a list of more specific subtopics or sub-questions that would help in researching it thoroughly. 6 | If the topic is simple or already focused, just return it as is. 7 | Format your response as a simple list with one subtopic per line. 8 | 9 | Topic: {topic} 10 | 11 | Subtopics:""" 12 | response = invoke_model(llm, decomposition_prompt) 13 | response_text = response.content 14 | subtopics = [line.strip("- ").strip() for line in response_text.split("\n") if line.strip()] 15 | subtopics = [s for s in subtopics if s and not s.lower().startswith(("subtopic", "topic"))] 16 | return subtopics if subtopics else [topic] -------------------------------------------------------------------------------- /tools/vision_query.py: -------------------------------------------------------------------------------- 1 | from Model.invokemodel import invoke_model 2 | from config.log import logger 3 | import re 4 | 5 | def generate_vision_query(llm, original_query: str) -> str: 6 | """Generate a focused vision query based on the original research question.""" 7 | prompt = f"""You are a tool assisting in generating natural and concise vision model queries. 8 | Your task is to transform research questions into specific and actionable prompts that guide the model in analyzing webpage screenshots. 9 | The queries should: 10 | - Start with "Describe the image in detail, focusing on". 11 | - Be natural, concise, and no longer than 15 words. 12 | - Highlight the most relevant information to answer the research question. 13 | - Avoid mechanical or vague phrases like "extract X from the image." 14 | 15 | Examples: 16 | Research question: What is the current Tesla stock price? 17 | Vision query: Describe the image in detail, focusing on the specific Tesla stock price. 18 | 19 | Research question: What are the iPhone 15 specs? 
20 | Vision query: Describe the image in detail, focusing on the iPhone 15 specifications and features. 21 | 22 | Research question: How much does the latest MacBook Pro cost? 23 | Vision query: Describe the image in detail, focusing on the price of the latest MacBook Pro. 24 | 25 | Research question: {original_query} 26 | 27 | Vision query:""" 28 | 29 | try: 30 | response = invoke_model(llm, prompt) 31 | vision_query = response.content.strip() 32 | 33 | # Clean up and standardize the query 34 | vision_query = vision_query.replace('"', '').replace("'", '') 35 | if not vision_query.lower().startswith("describe the image"): 36 | vision_query = f"Describe the image in detail, focusing on {vision_query}" 37 | 38 | # Remove mechanical phrases 39 | vision_query = vision_query.replace("extract from the image", "") 40 | vision_query = vision_query.replace("from the image", "") 41 | vision_query = re.sub(r'\s+', ' ', vision_query).strip() 42 | 43 | # Ensure it ends properly 44 | if vision_query.endswith("focusing on"): 45 | vision_query = vision_query[:-11].strip() 46 | 47 | return vision_query 48 | except Exception as e: 49 | logger.error(f"Error generating vision query: {str(e)}") 50 | return "Describe the image in detail, focusing on the main content and key information." 51 | --------------------------------------------------------------------------------
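
Note (illustrative, not part of the repository): the standalone sketch below mirrors the source-reliability arithmetic already present in memory/research_mem.py — the 0.9/0.1 exponential-moving-average update applied in update_source_reliability and the 0.5/0.3/0.2 ranking blend used by prioritize_urls. The domain history and the numbers in the example are hypothetical, chosen only to show how the weights behave; nothing here is imported by the project.

# reliability_sketch.py — illustrative only; mirrors formulas from memory/research_mem.py.

def ema_update(current: float, content_quality: float, success: bool) -> float:
    """Blend one fetch outcome into a per-query-type reliability score (as in update_source_reliability)."""
    if success:
        return current * 0.9 + content_quality * 0.1   # keep 90% of history, add 10% of the new quality signal
    return current * 0.9                                # failures decay the score multiplicatively

def url_score(reliability: float, successes: int, attempts: int, avg_response_time: float) -> float:
    """Ranking score used when ordering candidate URLs (as in prioritize_urls)."""
    success_rate = successes / max(1, attempts)
    response_speed = 1.0 / (1.0 + avg_response_time)
    return reliability * 0.5 + success_rate * 0.3 + response_speed * 0.2

if __name__ == "__main__":
    score = 0.0
    for quality in (0.9, 0.8, 0.85):                 # three good fetches for this query type
        score = ema_update(score, quality, success=True)
    score = ema_update(score, 0.0, success=False)    # one failed fetch
    print(f"reliability after 4 updates: {score:.3f}")           # ~0.207
    print(f"ranking score: {url_score(score, 8, 10, 1.2):.3f}")  # ~0.434

Because unknown domains fall back to a flat 0.1 score in prioritize_urls, a domain with even a short positive history will usually rank ahead of them, while repeated failures erode its score geometrically.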