config

Personal configuration.
git clone git://code.dwrz.net/config
Log | Files | Refs

gptel-gemini.el (13592B)


      1 ;;; gptel-gemini.el ---  Gemini support for gptel  -*- lexical-binding: t; -*-
      2 
      3 ;; Copyright (C) 2023  Karthik Chikmagalur
      4 
      5 ;; Author: Karthik Chikmagalur <karthikchikmagalur@gmail.com>
      6 
      7 ;; This program is free software; you can redistribute it and/or modify
      8 ;; it under the terms of the GNU General Public License as published by
      9 ;; the Free Software Foundation, either version 3 of the License, or
     10 ;; (at your option) any later version.
     11 
     12 ;; This program is distributed in the hope that it will be useful,
     13 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 ;; GNU General Public License for more details.
     16 
     17 ;; You should have received a copy of the GNU General Public License
     18 ;; along with this program.  If not, see <https://www.gnu.org/licenses/>.
     19 
     20 ;;; Commentary:
     21 
     22 ;; This file adds support for the Gemini API to gptel
     23 
     24 ;;; Code:
     25 (require 'gptel)
     26 (require 'cl-generic)
     27 (require 'map)
     28 (eval-when-compile (require 'cl-lib))
     29 
     30 (declare-function prop-match-value "text-property-search")
     31 (declare-function text-property-search-backward "text-property-search")
     32 (declare-function json-read "json")
     33 (declare-function gptel-context--wrap "gptel-context")
     34 (declare-function gptel-context--collect-media "gptel-context")
     35 (defvar json-object-type)
     36 
     37 ;;; Gemini
     38 (cl-defstruct
     39     (gptel-gemini (:constructor gptel--make-gemini)
     40                   (:copier nil)
     41                   (:include gptel-backend)))
     42 
     43 (cl-defmethod gptel-curl--parse-stream ((_backend gptel-gemini) _info)
     44   (let* ((content-strs))
     45     (condition-case nil
     46         ;; while-let is Emacs 29.1+ only
     47         (while (prog1 (search-forward "{" nil t)
     48                  (backward-char 1))
     49           (save-match-data
     50             (when-let*
     51                 ((response (gptel--json-read))
     52                  (text (map-nested-elt
     53                         response '(:candidates 0 :content :parts 0 :text))))
     54               (push text content-strs))))
     55       (error
     56        (goto-char (match-beginning 0))))
     57     (apply #'concat (nreverse content-strs))))
     58 
     59 (cl-defmethod gptel--parse-response ((_backend gptel-gemini) response _info)
     60   (map-nested-elt response '(:candidates 0 :content :parts 0 :text)))
     61 
     62 (cl-defmethod gptel--request-data ((_backend gptel-gemini) prompts)
     63   "JSON encode PROMPTS for sending to Gemini."
     64   ;; HACK (backwards compatibility) Prepend the system message to the first user
     65   ;; prompt, but only for gemini-pro.
     66   (when (and (equal gptel-model 'gemini-pro) gptel--system-message)
     67     (cl-callf
     68         (lambda (msg)
     69           (vconcat `((:text ,(concat gptel--system-message "\n\n"))) msg))
     70         (thread-first (car prompts)
     71                       (plist-get :parts))))
     72   (let ((prompts-plist
     73          `(:contents [,@prompts]
     74            :safetySettings [(:category "HARM_CATEGORY_HARASSMENT"
     75                              :threshold "BLOCK_NONE")
     76                             (:category "HARM_CATEGORY_SEXUALLY_EXPLICIT"
     77                              :threshold "BLOCK_NONE")
     78                             (:category "HARM_CATEGORY_DANGEROUS_CONTENT"
     79                              :threshold "BLOCK_NONE")
     80                             (:category "HARM_CATEGORY_HATE_SPEECH"
     81                              :threshold "BLOCK_NONE")]))
     82         params)
     83     ;; HACK only gemini-pro doesn't support system messages.  Need a less hacky
     84     ;; way to do this.
     85     (if (and gptel--system-message
     86              (not (gptel--model-capable-p 'nosystem))
     87              (not (equal gptel-model 'gemini-pro)))
     88       (plist-put prompts-plist :system_instruction
     89                  `(:parts (:text ,gptel--system-message))))
     90     (when gptel-temperature
     91       (setq params
     92             (plist-put params
     93                        :temperature (max gptel-temperature 1.0))))
     94     (when gptel-max-tokens
     95       (setq params
     96             (plist-put params
     97                        :maxOutputTokens gptel-max-tokens)))
     98     (when params
     99       (plist-put prompts-plist
    100                  :generationConfig params))
    101     ;; Merge request params with model and backend params.
    102     (gptel--merge-plists
    103      prompts-plist
    104      (gptel-backend-request-params gptel-backend)
    105      (gptel--model-request-params  gptel-model))))
    106 
    107 (cl-defmethod gptel--parse-list ((_backend gptel-gemini) prompt-list)
    108   (cl-loop for text in prompt-list
    109            for role = t then (not role)
    110            if text
    111            if role
    112            collect (list :role "user" :parts `[(:text ,text)]) into prompts
    113            else collect (list :role "model" :parts `(:text ,text)) into prompts
    114            finally return prompts))
    115 
    116 (cl-defmethod gptel--parse-buffer ((_backend gptel-gemini) &optional max-entries)
    117   (let ((prompts) (prop)
    118         (include-media (and gptel-track-media (or (gptel--model-capable-p 'media)
    119                                                   (gptel--model-capable-p 'url)))))
    120     (if (or gptel-mode gptel-track-response)
    121         (while (and
    122                 (or (not max-entries) (>= max-entries 0))
    123                 (setq prop (text-property-search-backward
    124                             'gptel 'response
    125                             (when (get-char-property (max (point-min) (1- (point)))
    126                                                      'gptel)
    127                               t))))
    128           (if (prop-match-value prop)   ;assistant role
    129               (push (list :role "model"
    130                           :parts
    131                           (list :text (buffer-substring-no-properties (prop-match-beginning prop)
    132                                                                       (prop-match-end prop))))
    133                     prompts)
    134             (if include-media
    135                 (push (list :role "user"
    136                             :parts (gptel--gemini-parse-multipart
    137                                     (gptel--parse-media-links
    138                                      major-mode (prop-match-beginning prop) (prop-match-end prop))))
    139                       prompts)
    140               (push (list :role "user"
    141                           :parts
    142                           `[(:text ,(gptel--trim-prefixes
    143                                      (buffer-substring-no-properties (prop-match-beginning prop)
    144                                       (prop-match-end prop))))])
    145                     prompts)))
    146           (and max-entries (cl-decf max-entries)))
    147       (push (list :role "user"
    148                   :parts
    149                   `[(:text ,(string-trim (buffer-substring-no-properties (point-min) (point-max))))])
    150             prompts))
    151     prompts))
    152 
    153 (defun gptel--gemini-parse-multipart (parts)
    154   "Convert a multipart prompt PARTS to the Gemini API format.
    155 
    156 The input is an alist of the form
    157  ((:text \"some text\")
    158   (:media \"/path/to/media.png\" :mime \"image/png\")
    159   (:text \"More text\")).
    160 
    161 The output is a vector of entries in a backend-appropriate
    162 format."
    163   (cl-loop
    164    for part in parts
    165    for n upfrom 1
    166    with last = (length parts)
    167    for text = (plist-get part :text)
    168    for media = (plist-get part :media)
    169    if text do
    170    (and (or (= n 1) (= n last)) (setq text (gptel--trim-prefixes text))) and
    171    unless (string-empty-p text)
    172    collect (list :text text) into parts-array end
    173    else if media
    174    collect
    175    `(:inline_data
    176      (:mime_type ,(plist-get part :mime)
    177       :data ,(gptel--base64-encode media)))
    178    into parts-array
    179    finally return (vconcat parts-array)))
    180 
    181 (cl-defmethod gptel--wrap-user-prompt ((_backend gptel-gemini) prompts
    182                                        &optional inject-media)
    183   "Wrap the last user prompt in PROMPTS with the context string.
    184 
    185 If INJECT-MEDIA is non-nil wrap it with base64-encoded media
    186 files in the context."
    187   (if inject-media
    188       ;; Wrap the first user prompt with included media files/contexts
    189       (when-let ((media-list (gptel-context--collect-media)))
    190         (cl-callf (lambda (current)
    191                     (vconcat (gptel--gemini-parse-multipart media-list)
    192                              current))
    193             (plist-get (car prompts) :parts)))
    194     ;; Wrap the last user prompt with included text contexts
    195     (cl-callf (lambda (current)
    196                 (if-let ((wrapped (gptel-context--wrap nil)))
    197                     (vconcat `((:text ,wrapped)) current)
    198                   current))
    199         (plist-get (car (last prompts)) :parts))))
    200 
    201 (defconst gptel--gemini-models
    202   '((gemini-pro
    203      :description "The previous generation of Google's multimodal AI model"
    204      :capabilities (tool json media)
    205      :mime-types ("image/png" "image/jpeg" "image/webp" "image/heic" "image/heif"
    206                   "application/pdf" "text/plain" "text/csv" "text/html")
    207      :context-window 32
    208      :input-cost 0.50
    209      :output-cost 1.50
    210      :cutoff-date "2023-02")
    211     (gemini-1.5-flash
    212      :description "A faster, more efficient version of Gemini 1.5 optimized for speed"
    213      :capabilities (tool json media)
    214      :mime-types ("image/png" "image/jpeg" "image/webp" "image/heic" "image/heif"
    215                   "application/pdf" "text/plain" "text/csv" "text/html")
    216      :context-window 1000
    217      ;; input & output price is halved for prompts of 128k tokens or less
    218      :input-cost 0.15
    219      :output-cost 0.60
    220      :cutoff-date "2024-05")
    221     (gemini-1.5-pro-latest
    222      :description "Google's latest model with enhanced capabilities across various tasks"
    223      :capabilities (tool json media)
    224      :mime-types ("image/png" "image/jpeg" "image/webp" "image/heic" "image/heif"
    225                   "application/pdf" "text/plain" "text/csv" "text/html")
    226      :context-window 2000
    227      ;; input & output price is halved for prompts of 128k tokens or less
    228      :input-cost 2.50
    229      :output-cost 10
    230      :cutoff-date "2024-05"))
    231   "List of available Gemini models and associated properties.
    232 Keys:
    233 
    234 - `:description': a brief description of the model.
    235 
    236 - `:capabilities': a list of capabilities supported by the model.
    237 
    238 - `:mime-types': a list of supported MIME types for media files.
    239 
    240 - `:context-window': the context window size, in thousands of tokens.
    241 
    242 - `:input-cost': the input cost, in US dollars per million tokens.
    243 
    244 - `:output-cost': the output cost, in US dollars per million tokens.
    245 
    246 - `:cutoff-date': the knowledge cutoff date.
    247 
    248 - `:request-params': a plist of additional request parameters to
    249   include when using this model.
    250 
    251 Information about the Gemini models was obtained from the following
    252 source:
    253 
    254 - <https://ai.google.dev/pricing>
    255 - <https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models>")
    256 
    257 ;;;###autoload
    258 (cl-defun gptel-make-gemini
    259     (name &key curl-args header key request-params
    260           (stream nil)
    261           (host "generativelanguage.googleapis.com")
    262           (protocol "https")
    263           (models gptel--gemini-models)
    264           (endpoint "/v1beta/models"))
    265 
    266   "Register a Gemini backend for gptel with NAME.
    267 
    268 Keyword arguments:
    269 
    270 CURL-ARGS (optional) is a list of additional Curl arguments.
    271 
    272 HOST (optional) is the API host, defaults to
    273 \"generativelanguage.googleapis.com\".
    274 
    275 MODELS is a list of available model names, as symbols.
    276 Additionally, you can specify supported LLM capabilities like
    277 vision or tool-use by appending a plist to the model with more
    278 information, in the form
    279 
    280  (model-name . plist)
    281 
    282 For a list of currently recognized plist keys, see
    283 `gptel--gemini-models'. An example of a model specification
    284 including both kinds of specs:
    285 
    286 :models
    287 \\='(gemini-pro                            ;Simple specs
    288   gemini-1.5-flash
    289   (gemini-1.5-pro-latest                ;Full spec
    290    :description
    291    \"Complex reasoning tasks, problem solving and data extraction\"
    292    :capabilities (tool json)
    293    :mime-types
    294    (\"image/jpeg\" \"image/png\" \"image/webp\" \"image/heic\")))
    295 
    296 
    297 STREAM is a boolean to enable streaming responses, defaults to
    298 false.
    299 
    300 PROTOCOL (optional) specifies the protocol, \"https\" by default.
    301 
    302 ENDPOINT (optional) is the API endpoint for completions, defaults to
    303 \"/v1beta/models\".
    304 
    305 HEADER (optional) is for additional headers to send with each
    306 request.  It should be an alist or a function that retuns an
    307 alist, like:
    308  ((\"Content-Type\" . \"application/json\"))
    309 
    310 KEY (optional) is a variable whose value is the API key, or
    311 function that returns the key.
    312 
    313 REQUEST-PARAMS (optional) is a plist of additional HTTP request
    314 parameters (as plist keys) and values supported by the API.  Use
    315 these to set parameters that gptel does not provide user options
    316 for."
    317   (declare (indent 1))
    318   (let ((backend (gptel--make-gemini
    319                   :curl-args curl-args
    320                   :name name
    321                   :host host
    322                   :header header
    323                   :models (gptel--process-models models)
    324                   :protocol protocol
    325                   :endpoint endpoint
    326                   :stream stream
    327                   :request-params request-params
    328                   :key key
    329                   :url (lambda ()
    330                          (let ((method (if (and stream
    331                                                 gptel-stream)
    332                                            "streamGenerateContent"
    333                                          "generateContent")))
    334                            (format "%s://%s%s/%s:%s?key=%s"
    335                                    protocol
    336                                    host
    337                                    endpoint
    338                                    gptel-model
    339                                    method
    340                                    (gptel--get-api-key)))))))
    341     (prog1 backend
    342       (setf (alist-get name gptel--known-backends
    343                        nil nil #'equal)
    344             backend))))
    345 
    346 (provide 'gptel-gemini)
    347 ;;; gptel-gemini.el ends here